From 3f6ecafffdce0cf8d939d86210a7f22c5b135ff7 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 15 Jul 2025 15:10:41 -0700 Subject: [PATCH 1/5] [AMDGPU] Inflate to %av regclass Change-Id: Ied8fe81cf2c8271ca22eedbade4eb312f3fbea39 --- .../Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 16 + llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll | 357 ++++++++++++++++++ 2 files changed, 373 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp index 0137b3f5943d7..427922481ecca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -97,6 +97,8 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { const MCInstrDesc &AVImmPseudo32 = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO); const MCInstrDesc &AVImmPseudo64 = TII.get(AMDGPU::AV_MOV_B64_IMM_PSEUDO); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); bool Changed = false; for (MachineBasicBlock &MBB : MF) { @@ -119,6 +121,20 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { Changed = true; continue; } + + for (MachineOperand &Op : MI.operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + + Register DefReg = Op.getReg(); + if (DefReg.isPhysical()) + continue; + + const TargetRegisterClass *RC = MRI.getRegClass(DefReg); + + if (TRI->isAGPRClass(RC) || TRI->isVGPRClass(RC)) + Changed |= MRI.recomputeRegClass(DefReg); + } } } diff --git a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll new file mode 100644 index 0000000000000..3a534149121fb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll @@ -0,0 +1,357 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 
--greedy-regclass-priority-trumps-globalness=1 < %s | FileCheck %s + +define amdgpu_kernel void @bad_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, i1 %cond) #0 { +; CHECK-LABEL: bad_rp: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s1, s[4:5], 0x10 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: ds_read_b128 a[0:3], v0 +; CHECK-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; CHECK-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; CHECK-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; CHECK-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; CHECK-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; CHECK-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; CHECK-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; CHECK-NEXT: ds_read_b128 a[32:35], v0 offset:128 +; CHECK-NEXT: ds_read_b128 a[36:39], v0 offset:144 +; CHECK-NEXT: ds_read_b128 a[40:43], v0 offset:160 +; CHECK-NEXT: ds_read_b128 a[44:47], v0 offset:176 +; CHECK-NEXT: ds_read_b128 a[48:51], v0 offset:192 +; CHECK-NEXT: ds_read_b128 a[52:55], v0 offset:208 +; CHECK-NEXT: ds_read_b128 a[56:59], v0 offset:224 +; CHECK-NEXT: ds_read_b128 a[60:63], v0 offset:240 +; CHECK-NEXT: s_bitcmp1_b32 s1, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; CHECK-NEXT: .LBB0_1: ; %bb.1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(14) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], a[0:3], a[0:3], 0 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[224:239], a[4:7], a[4:7], v[240:255] +; CHECK-NEXT: s_waitcnt lgkmcnt(13) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], a[8:11], a[8:11], v[224:239] +; CHECK-NEXT: s_waitcnt lgkmcnt(12) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], a[12:15], a[12:15], v[208:223] +; CHECK-NEXT: s_waitcnt lgkmcnt(11) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], a[16:19], a[16:19], v[192:207] +; CHECK-NEXT: s_waitcnt 
lgkmcnt(10) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], a[20:23], a[20:23], v[176:191] +; CHECK-NEXT: s_waitcnt lgkmcnt(9) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], a[24:27], a[24:27], v[160:175] +; CHECK-NEXT: s_waitcnt lgkmcnt(8) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], a[28:31], a[28:31], v[144:159] +; CHECK-NEXT: s_waitcnt lgkmcnt(7) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], a[32:35], a[32:35], v[128:143] +; CHECK-NEXT: s_waitcnt lgkmcnt(6) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], a[36:39], a[36:39], v[112:127] +; CHECK-NEXT: s_waitcnt lgkmcnt(5) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], a[40:43], a[40:43], v[96:111] +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], a[44:47], a[44:47], v[80:95] +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], a[48:51], a[48:51], v[64:79] +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], a[52:55], a[52:55], v[48:63] +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], a[56:59], a[56:59], v[32:47] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], a[60:63], a[60:63], v[16:31] +; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %bb.2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b64_e32 v[168:169], s[0:1] +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[244:247] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[240:243] +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[224:227] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[228:231] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[208:211] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[212:215] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[192:195] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[196:199] offset:112 +; CHECK-NEXT: 
flat_store_dwordx4 v[168:169], v[180:183] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[176:179] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[160:163] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[164:167] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[148:151] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[156:159] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[152:155] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[144:147] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[140:143] offset:272 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[136:139] offset:256 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[132:135] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[128:131] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[124:127] offset:304 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[120:123] offset:288 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[116:119] offset:272 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[112:115] offset:256 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[108:111] offset:336 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[104:107] offset:320 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[100:103] offset:304 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[96:99] offset:288 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[92:95] offset:368 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[88:91] offset:352 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[84:87] offset:336 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[80:83] offset:320 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[76:79] offset:400 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[72:75] offset:384 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[68:71] offset:368 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[64:67] offset:352 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[60:63] offset:432 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[56:59] 
offset:416 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[52:55] offset:400 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[48:51] offset:384 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[44:47] offset:464 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[40:43] offset:448 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[36:39] offset:432 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[32:35] offset:416 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[28:31] offset:496 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[24:27] offset:480 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[20:23] offset:464 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[16:19] offset:448 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[12:15] offset:528 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[8:11] offset:512 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[4:7] offset:496 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[0:3] offset:480 +; CHECK-NEXT: s_endpgm + %gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4 + %gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8 + %gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12 + %gep4 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 16 + %gep5 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 20 + %gep6 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 24 + %gep7 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 28 + %gep8 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 32 + %gep9 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 36 + %gep10 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 40 + %gep11 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 44 + %gep12 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 48 + %gep13 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 52 + %gep14 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 56 + %gep15 = getelementptr ptr 
addrspace(3), ptr addrspace(3) %in0, i32 60 + %load0 = load <8 x half>, ptr addrspace(3) %in0, align 16 + %load1 = load <8 x half>, ptr addrspace(3) %gep1, align 16 + %load2 = load <8 x half>, ptr addrspace(3) %gep2, align 16 + %load3 = load <8 x half>, ptr addrspace(3) %gep3, align 16 + %load4 = load <8 x half>, ptr addrspace(3) %gep4, align 16 + %load5 = load <8 x half>, ptr addrspace(3) %gep5, align 16 + %load6 = load <8 x half>, ptr addrspace(3) %gep6, align 16 + %load7 = load <8 x half>, ptr addrspace(3) %gep7, align 16 + %load8 = load <8 x half>, ptr addrspace(3) %gep8, align 16 + %load9 = load <8 x half>, ptr addrspace(3) %gep9, align 16 + %load10 = load <8 x half>, ptr addrspace(3) %gep10, align 16 + %load11 = load <8 x half>, ptr addrspace(3) %gep11, align 16 + %load12 = load <8 x half>, ptr addrspace(3) %gep12, align 16 + %load13 = load <8 x half>, ptr addrspace(3) %gep13, align 16 + %load14 = load <8 x half>, ptr addrspace(3) %gep14, align 16 + %load15 = load <8 x half>, ptr addrspace(3) %gep15, align 16 + br label %bb.1 + +bb.1: + %mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + %mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0) + %mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 0) + %mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0) + %mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0) + %mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0) + %mfma6 = tail call <16 x float> 
@llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0) + %mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0) + %mfma8 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load8, <8 x half> %load8, <16 x float> %mfma7, i32 0, i32 0, i32 0) + %mfma9 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load9, <8 x half> %load9, <16 x float> %mfma8, i32 0, i32 0, i32 0) + %mfma10 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load10, <8 x half> %load10, <16 x float> %mfma9, i32 0, i32 0, i32 0) + %mfma11 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load11, <8 x half> %load11, <16 x float> %mfma10, i32 0, i32 0, i32 0) + %mfma12 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load12, <8 x half> %load12, <16 x float> %mfma11, i32 0, i32 0, i32 0) + %mfma13 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load13, <8 x half> %load13, <16 x float> %mfma12, i32 0, i32 0, i32 0) + %mfma14 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load14, <8 x half> %load14, <16 x float> %mfma13, i32 0, i32 0, i32 0) + %mfma15 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load15, <8 x half> %load15, <16 x float> %mfma14, i32 0, i32 0, i32 0) + br i1 %cond, label %bb.1, label %bb.2 + +bb.2: + %out1 = getelementptr ptr, ptr %out, i32 4 + %out2 = getelementptr ptr, ptr %out, i32 8 + %out3 = getelementptr ptr, ptr %out, i32 12 + %out4 = getelementptr ptr, ptr %out, i32 16 + %out5 = getelementptr ptr, ptr %out, i32 20 + %out6 = getelementptr ptr, ptr %out, i32 24 + %out7 = getelementptr ptr, ptr %out, i32 28 + %out8 = getelementptr ptr, ptr %out, i32 32 + %out9 = getelementptr ptr, ptr %out, i32 36 + %out10 = getelementptr ptr, ptr %out, i32 40 + %out11 = 
getelementptr ptr, ptr %out, i32 44 + %out12 = getelementptr ptr, ptr %out, i32 48 + %out13 = getelementptr ptr, ptr %out, i32 52 + %out14 = getelementptr ptr, ptr %out, i32 56 + %out15 = getelementptr ptr, ptr %out, i32 60 + store <16 x float> %mfma0, ptr addrspace(0) %out + store <16 x float> %mfma1, ptr addrspace(0) %out1 + store <16 x float> %mfma2, ptr addrspace(0) %out2 + store <16 x float> %mfma3, ptr addrspace(0) %out3 + store <16 x float> %mfma4, ptr addrspace(0) %out4 + store <16 x float> %mfma5, ptr addrspace(0) %out5 + store <16 x float> %mfma6, ptr addrspace(0) %out6 + store <16 x float> %mfma7, ptr addrspace(0) %out7 + store <16 x float> %mfma8, ptr addrspace(0) %out8 + store <16 x float> %mfma9, ptr addrspace(0) %out9 + store <16 x float> %mfma10, ptr addrspace(0) %out10 + store <16 x float> %mfma11, ptr addrspace(0) %out11 + store <16 x float> %mfma12, ptr addrspace(0) %out12 + store <16 x float> %mfma13, ptr addrspace(0) %out13 + store <16 x float> %mfma14, ptr addrspace(0) %out14 + store <16 x float> %mfma15, ptr addrspace(0) %out15 + ret void +} + +define amdgpu_kernel void @good_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, i1 %cond) #0 { +; CHECK-LABEL: good_rp: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10 +; CHECK-NEXT: s_load_dword s1, s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_bitcmp1_b32 s0, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s1 +; CHECK-NEXT: ds_read_b128 v[176:179], v0 +; CHECK-NEXT: ds_read_b128 v[180:183], v0 offset:16 +; CHECK-NEXT: ds_read_b128 v[184:187], v0 offset:32 +; CHECK-NEXT: ds_read_b128 v[188:191], v0 offset:48 +; CHECK-NEXT: ds_read_b128 v[192:195], v0 offset:64 +; CHECK-NEXT: ds_read_b128 v[196:199], v0 offset:80 +; CHECK-NEXT: ds_read_b128 v[200:203], v0 offset:96 +; CHECK-NEXT: ds_read_b128 v[204:207], v0 offset:112 +; CHECK-NEXT: ds_read_b128 v[208:211], v0 offset:128 +; CHECK-NEXT: ds_read_b128 v[212:215], v0 offset:144 +; CHECK-NEXT: ds_read_b128 v[216:219], v0 offset:160 +; 
CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: .LBB1_1: ; %bb.1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(10) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_waitcnt lgkmcnt(9) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175] +; CHECK-NEXT: s_waitcnt lgkmcnt(8) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159] +; CHECK-NEXT: s_waitcnt lgkmcnt(7) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143] +; CHECK-NEXT: s_waitcnt lgkmcnt(6) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127] +; CHECK-NEXT: s_waitcnt lgkmcnt(5) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111] +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95] +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79] +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63] +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31] +; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: ; %bb.2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b64_e32 v[88:89], s[0:1] +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[160:163] +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[144:147] 
offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[68:71] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320 +; CHECK-NEXT: s_endpgm + %gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4 + %gep2 = getelementptr ptr 
addrspace(3), ptr addrspace(3) %in0, i32 8 + %gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12 + %gep4 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 16 + %gep5 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 20 + %gep6 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 24 + %gep7 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 28 + %gep8 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 32 + %gep9 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 36 + %gep10 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 40 + %load0 = load <8 x half>, ptr addrspace(3) %in0, align 16 + %load1 = load <8 x half>, ptr addrspace(3) %gep1, align 16 + %load2 = load <8 x half>, ptr addrspace(3) %gep2, align 16 + %load3 = load <8 x half>, ptr addrspace(3) %gep3, align 16 + %load4 = load <8 x half>, ptr addrspace(3) %gep4, align 16 + %load5 = load <8 x half>, ptr addrspace(3) %gep5, align 16 + %load6 = load <8 x half>, ptr addrspace(3) %gep6, align 16 + %load7 = load <8 x half>, ptr addrspace(3) %gep7, align 16 + %load8 = load <8 x half>, ptr addrspace(3) %gep8, align 16 + %load9 = load <8 x half>, ptr addrspace(3) %gep9, align 16 + %load10 = load <8 x half>, ptr addrspace(3) %gep10, align 16 + br label %bb.1 + +bb.1: + %mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + %mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0) + %mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 0) + %mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0) + %mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x 
half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0) + %mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0) + %mfma6 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0) + %mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0) + %mfma8 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load8, <8 x half> %load8, <16 x float> %mfma7, i32 0, i32 0, i32 0) + %mfma9 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load9, <8 x half> %load9, <16 x float> %mfma8, i32 0, i32 0, i32 0) + %mfma10 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load10, <8 x half> %load10, <16 x float> %mfma9, i32 0, i32 0, i32 0) + br i1 %cond, label %bb.1, label %bb.2 + +bb.2: + %out1 = getelementptr ptr, ptr %out, i32 4 + %out2 = getelementptr ptr, ptr %out, i32 8 + %out3 = getelementptr ptr, ptr %out, i32 12 + %out4 = getelementptr ptr, ptr %out, i32 16 + %out5 = getelementptr ptr, ptr %out, i32 20 + %out6 = getelementptr ptr, ptr %out, i32 24 + %out7 = getelementptr ptr, ptr %out, i32 28 + %out8 = getelementptr ptr, ptr %out, i32 32 + %out9 = getelementptr ptr, ptr %out, i32 36 + %out10 = getelementptr ptr, ptr %out, i32 40 + store <16 x float> %mfma0, ptr addrspace(0) %out + store <16 x float> %mfma1, ptr addrspace(0) %out1 + store <16 x float> %mfma2, ptr addrspace(0) %out2 + store <16 x float> %mfma3, ptr addrspace(0) %out3 + store <16 x float> %mfma4, ptr addrspace(0) %out4 + store <16 x float> %mfma5, ptr addrspace(0) %out5 + store <16 x float> %mfma6, ptr addrspace(0) %out6 + store <16 x float> %mfma7, ptr addrspace(0) %out7 + store <16 x float> %mfma8, ptr addrspace(0) %out8 + store <16 x float> %mfma9, ptr addrspace(0) %out9 
+ store <16 x float> %mfma10, ptr addrspace(0) %out10 + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1" } From 6d5273761c2659ecaf8f453f8c9def032aed145e Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 25 Sep 2025 17:25:04 -0700 Subject: [PATCH 2/5] Update lit tests Change-Id: I5a6da22ff34debbc677973453d038c86d32d0ad0 --- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 144 +- llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll | 406 +- .../CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll | 81 +- .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 3678 +++--- .../AMDGPU/a-v-global-atomic-cmpxchg.ll | 28 +- .../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 1070 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 12 +- llvm/test/CodeGen/AMDGPU/agpr-csr.ll | 680 +- llvm/test/CodeGen/AMDGPU/agpr-remat.ll | 16 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 42 +- .../buffer-fat-pointer-atomicrmw-fmax.ll | 30 +- .../buffer-fat-pointer-atomicrmw-fmin.ll | 30 +- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 307 +- llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 422 +- .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 4 - .../CodeGen/AMDGPU/global-i16-load-store.ll | 12 +- .../AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 7 +- .../AMDGPU/lds-dma-workgroup-release.ll | 24 +- .../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll | 8 +- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 12 +- .../llvm.amdgcn.image.atomic.dim.gfx90a.ll | 34 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll | 96 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 80 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 476 +- .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 146 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 1236 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll | 41 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 1496 +-- ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 3688 ++++-- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 10697 
++++++++++------ .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 2436 ++-- ...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 4 - ...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 16 +- ...uffer-fat-pointers-nontemporal-metadata.ll | 24 +- llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 2000 ++- llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 1373 +- .../AMDGPU/mfma-no-register-aliasing.ll | 604 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 51 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.mir | 26 +- .../CodeGen/AMDGPU/no-fold-accvgpr-read.mir | 4 +- .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 2 +- .../AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll | 2 - .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 119 +- .../AMDGPU/shufflevector-physreg-copy.ll | 12 +- .../AMDGPU/shufflevector.v2f32.v3f32.ll | 28 +- .../AMDGPU/shufflevector.v2i32.v3i32.ll | 28 +- .../AMDGPU/shufflevector.v2i64.v2i64.ll | 40 +- .../CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll | 40 +- .../CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll | 28 +- .../AMDGPU/shufflevector.v3f32.v2f32.ll | 391 +- .../AMDGPU/shufflevector.v3f32.v3f32.ll | 200 +- .../AMDGPU/shufflevector.v3f32.v4f32.ll | 377 +- .../AMDGPU/shufflevector.v3i32.v2i32.ll | 391 +- .../AMDGPU/shufflevector.v3i32.v3i32.ll | 200 +- .../AMDGPU/shufflevector.v3i32.v4i32.ll | 377 +- .../AMDGPU/shufflevector.v3i64.v2i64.ll | 92 +- .../CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll | 92 +- .../CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll | 391 +- .../CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll | 200 +- .../CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll | 377 +- .../AMDGPU/shufflevector.v4f32.v2f32.ll | 45 +- .../AMDGPU/shufflevector.v4f32.v3f32.ll | 1474 +-- .../AMDGPU/shufflevector.v4f32.v4f32.ll | 518 +- .../AMDGPU/shufflevector.v4i32.v2i32.ll | 45 +- .../AMDGPU/shufflevector.v4i32.v3i32.ll | 1474 +-- .../AMDGPU/shufflevector.v4i32.v4i32.ll | 518 +- .../AMDGPU/shufflevector.v4i64.v2i64.ll | 364 +- .../AMDGPU/shufflevector.v4i64.v3i64.ll | 204 +- .../AMDGPU/shufflevector.v4i64.v4i64.ll | 40 +- 
.../CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll | 364 +- .../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 204 +- .../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 40 +- .../CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll | 45 +- .../CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll | 1474 +-- .../CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll | 518 +- .../AMDGPU/undef-handling-crash-in-ra.ll | 23 +- .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 22 +- 78 files changed, 23782 insertions(+), 18530 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 7e297f46a780e..9f1955c78eb36 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -23,9 +23,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_add_noret_f64: @@ -34,9 +34,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; 
GFX1250-LABEL: raw_buffer_atomic_add_noret_f64: @@ -142,9 +142,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64: @@ -153,9 +153,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64: @@ -261,9 +261,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_add_noret_f64: @@ -272,9 +272,9 @@ define amdgpu_kernel void 
@struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64: @@ -379,9 +379,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64: @@ -390,9 +390,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64: @@ -497,9 +497,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_min_noret_f64: @@ -508,9 +508,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64: @@ -616,9 +616,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64: @@ -627,9 +627,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 
v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64: @@ -735,9 +735,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_min_noret_f64: @@ -746,9 +746,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64: @@ -853,9 +853,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 
idxen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64: @@ -864,9 +864,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64: @@ -971,9 +971,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_max_noret_f64: @@ -982,9 +982,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: 
buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64: @@ -1090,9 +1090,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64: @@ -1101,9 +1101,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64: @@ -1209,9 +1209,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: 
struct_buffer_atomic_max_noret_f64: @@ -1220,9 +1220,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64: @@ -1327,9 +1327,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64: @@ -1338,9 +1338,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s8 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64: diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll 
b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll index 4c62409a85c00..2968e0441d349 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll @@ -183,122 +183,125 @@ define void @ds_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 { ; CHECK-LABEL: ds_atomic_xchg_i32_ret_av_av_no_agprs: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 
offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: v_accvgpr_write_b32 a33, v31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a32, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a31, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a30, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a29, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a28, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a27, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a26, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a25, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a24, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a23, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a22, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a21, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a20, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a19, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a17, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a16, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a15, v13 +; CHECK-NEXT: v_accvgpr_write_b32 a14, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a13, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a12, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a11, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a10, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a9, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a8, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a7, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a6, 
v4 +; CHECK-NEXT: v_accvgpr_write_b32 a5, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a4, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; 
CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v0, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a4 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a5 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a6 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a7 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a8 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a9 +; CHECK-NEXT: v_accvgpr_read_b32 v8, a10 +; CHECK-NEXT: v_accvgpr_read_b32 v9, a11 +; CHECK-NEXT: v_accvgpr_read_b32 v10, a12 +; CHECK-NEXT: v_accvgpr_read_b32 v11, a13 +; CHECK-NEXT: v_accvgpr_read_b32 v12, a14 +; 
CHECK-NEXT: v_accvgpr_read_b32 v13, a15 +; CHECK-NEXT: v_accvgpr_read_b32 v14, a16 +; CHECK-NEXT: v_accvgpr_read_b32 v15, a17 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a18 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a19 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a20 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a21 +; CHECK-NEXT: v_accvgpr_read_b32 v20, a22 +; CHECK-NEXT: v_accvgpr_read_b32 v21, a23 +; CHECK-NEXT: v_accvgpr_read_b32 v22, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v23, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a27 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a28 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a29 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a30 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a31 +; CHECK-NEXT: v_accvgpr_read_b32 v30, a32 +; CHECK-NEXT: v_accvgpr_read_b32 v31, a33 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a0 +; CHECK-NEXT: ; use v[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse +; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword 
v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:31] +; CHECK-NEXT: ; use a0 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10 %data = call i32 asm "; def $0", "=^VA"() @@ -744,122 +747,125 @@ define void @ds_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 { ; CHECK-LABEL: ds_atomic_xor_i32_ret_av_av_no_agprs: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 
; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: v_accvgpr_write_b32 a33, v31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a32, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a31, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a30, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a29, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a28, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a27, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a26, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a25, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a24, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a23, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a22, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a21, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a20, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a19, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a17, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a16, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a15, v13 +; CHECK-NEXT: v_accvgpr_write_b32 a14, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a13, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a12, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a11, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a10, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a9, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a8, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a7, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a6, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a5, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a4, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse -; CHECK-NEXT: 
v_accvgpr_write_b32 a30, v19 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 
offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v0, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a4 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a5 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a6 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a7 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a8 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a9 +; CHECK-NEXT: v_accvgpr_read_b32 v8, a10 +; CHECK-NEXT: v_accvgpr_read_b32 v9, a11 +; CHECK-NEXT: v_accvgpr_read_b32 v10, a12 +; CHECK-NEXT: v_accvgpr_read_b32 v11, a13 +; CHECK-NEXT: v_accvgpr_read_b32 v12, a14 +; CHECK-NEXT: v_accvgpr_read_b32 v13, a15 +; CHECK-NEXT: v_accvgpr_read_b32 v14, a16 +; CHECK-NEXT: v_accvgpr_read_b32 v15, a17 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a18 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a19 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a20 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a21 +; CHECK-NEXT: v_accvgpr_read_b32 v20, a22 +; CHECK-NEXT: v_accvgpr_read_b32 v21, a23 +; CHECK-NEXT: 
v_accvgpr_read_b32 v22, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v23, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a27 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a28 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a29 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a30 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a31 +; CHECK-NEXT: v_accvgpr_read_b32 v30, a32 +; CHECK-NEXT: v_accvgpr_read_b32 v31, a33 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a0 +; CHECK-NEXT: ; use v[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse +; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded 
Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:31] +; CHECK-NEXT: ; use a0 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10 %data = call i32 asm "; def $0", "=^VA"() diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll index bc341f2baa804..e9192ca2d03ac 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll @@ -472,49 +472,46 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_av__a: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; 
CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB14_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: .LBB14_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB14_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc ; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc ; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; CHECK-NEXT: .LBB14_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -533,53 +530,50 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_a__a: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB15_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: 
$vgpr2_vgpr3 ; CHECK-NEXT: .LBB15_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB15_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc ; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; CHECK-NEXT: .LBB15_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -774,49 +768,46 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_v_v__a: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; 
implicit-def: $agpr0_agpr1 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB19_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: .LBB19_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB19_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc ; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; CHECK-NEXT: .LBB19_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:1] ; CHECK-NEXT: ;;#ASMEND diff --git 
a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index d053425afbb6d..4a8225fcd6ad2 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -338,225 +338,264 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 
4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a2 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11 +; 
GFX90A-NEXT: v_accvgpr_write_b32 a12, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GFX90A-NEXT: 
v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ; use v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], 
s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; 
Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 
offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a2 +; GFX950-NEXT: ; def a34 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload 
Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_write_b32 a33, v31 +; GFX950-NEXT: v_accvgpr_write_b32 a32, v30 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v29 +; GFX950-NEXT: v_accvgpr_write_b32 a30, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a29, v27 +; GFX950-NEXT: v_accvgpr_write_b32 a28, v26 +; GFX950-NEXT: v_accvgpr_write_b32 a27, v25 +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 +; GFX950-NEXT: v_accvgpr_write_b32 a25, v23 +; GFX950-NEXT: v_accvgpr_write_b32 a24, v22 +; GFX950-NEXT: v_accvgpr_write_b32 a23, v21 +; GFX950-NEXT: v_accvgpr_write_b32 a22, v20 +; GFX950-NEXT: v_accvgpr_write_b32 a21, v19 +; GFX950-NEXT: v_accvgpr_write_b32 a20, v18 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v17 +; GFX950-NEXT: v_accvgpr_write_b32 a18, v16 +; GFX950-NEXT: v_accvgpr_write_b32 a17, v15 +; GFX950-NEXT: v_accvgpr_write_b32 a16, v14 +; GFX950-NEXT: v_accvgpr_write_b32 a15, v13 +; GFX950-NEXT: v_accvgpr_write_b32 a14, v12 +; GFX950-NEXT: v_accvgpr_write_b32 a13, v11 +; GFX950-NEXT: v_accvgpr_write_b32 a12, v10 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v9 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a9, v7 +; GFX950-NEXT: v_accvgpr_write_b32 a8, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a34 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX950-NEXT: 
v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a3 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a8 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a9 +; GFX950-NEXT: v_accvgpr_read_b32 v8, a10 +; GFX950-NEXT: v_accvgpr_read_b32 v9, a11 +; GFX950-NEXT: v_accvgpr_read_b32 v10, a12 +; GFX950-NEXT: v_accvgpr_read_b32 v11, a13 +; GFX950-NEXT: 
v_accvgpr_read_b32 v12, a14 +; GFX950-NEXT: v_accvgpr_read_b32 v13, a15 +; GFX950-NEXT: v_accvgpr_read_b32 v14, a16 +; GFX950-NEXT: v_accvgpr_read_b32 v15, a17 +; GFX950-NEXT: v_accvgpr_read_b32 v16, a18 +; GFX950-NEXT: v_accvgpr_read_b32 v17, a19 +; GFX950-NEXT: v_accvgpr_read_b32 v18, a20 +; GFX950-NEXT: v_accvgpr_read_b32 v19, a21 +; GFX950-NEXT: v_accvgpr_read_b32 v20, a22 +; GFX950-NEXT: v_accvgpr_read_b32 v21, a23 +; GFX950-NEXT: v_accvgpr_read_b32 v22, a24 +; GFX950-NEXT: v_accvgpr_read_b32 v23, a25 +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 +; GFX950-NEXT: v_accvgpr_read_b32 v25, a27 +; GFX950-NEXT: v_accvgpr_read_b32 v26, a28 +; GFX950-NEXT: v_accvgpr_read_b32 v27, a29 +; GFX950-NEXT: v_accvgpr_read_b32 v28, a30 +; GFX950-NEXT: v_accvgpr_read_b32 v29, a31 +; GFX950-NEXT: v_accvgpr_read_b32 v30, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v31, a33 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ; use v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: 
scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 %data = 
call i32 asm "; def $0", "=^VA"() @@ -640,43 +679,43 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB11_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB11_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB11_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -688,39 +727,39 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a[2:3] +; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB11_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; 
GFX950-NEXT: ; implicit-def: $agpr2_agpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB11_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB11_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v0, a[2:3], off +; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off ; GFX950-NEXT: .LBB11_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -832,41 +871,41 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB13_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB13_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB13_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB13_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -878,37 +917,37 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: 
v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB13_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB13_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off ; GFX950-NEXT: .LBB13_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -926,41 +965,40 @@ define void 
@flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB14_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB14_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB14_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen 
offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1016,41 +1054,40 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB15_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB15_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB15_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: 
buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1106,41 +1143,40 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB16_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB16_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB16_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -1152,37 +1188,37 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; 
GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB16_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB16_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB16_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off ; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -1384,12 +1420,10 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] 
; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 @@ -1406,14 +1440,13 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB19_2 ; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1483,7 +1516,6 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB20_2 ; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private @@ -1560,12 +1592,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1590,12 +1622,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB21_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1695,12 +1727,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1724,12 +1756,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB23_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1892,12 +1924,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1921,12 +1953,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB26_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -2494,7 +2526,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB32_4 @@ -2512,9 +2544,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_2 @@ -2528,18 +2558,18 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 
{ ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB32_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2556,7 +2586,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB32_4 @@ -2573,9 +2603,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB32_2 @@ -2590,15 +2618,15 @@ 
define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 -; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2742,7 +2770,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB34_4 @@ -2760,9 +2788,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_2 @@ -2776,18 +2802,18 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; 
GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB34_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2802,7 +2828,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB34_4 @@ -2819,9 +2845,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB34_2 @@ -2836,15 +2860,15 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 ; 
GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 -; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -3101,7 +3125,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB37_4 @@ -3119,9 +3143,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_2 @@ -3135,18 +3157,18 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: 
buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB37_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -3161,7 +3183,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB37_4 @@ -3178,9 +3200,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB37_2 @@ -3195,15 +3215,15 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: 
scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 -; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -4008,223 +4028,262 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 
4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Spill -; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a2 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21 +; GFX90A-NEXT: 
v_accvgpr_write_b32 a22, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX90A-NEXT: 
v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; 
GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ; use v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt 
vmcnt(0) -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX90A-NEXT: 
v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload 
Reuse +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, 
v[12:15], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a2 +; GFX950-NEXT: ; def a34 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_write_b32 a33, v31 +; GFX950-NEXT: v_accvgpr_write_b32 a32, v30 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v29 +; GFX950-NEXT: v_accvgpr_write_b32 a30, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a29, v27 +; GFX950-NEXT: v_accvgpr_write_b32 a28, v26 +; GFX950-NEXT: v_accvgpr_write_b32 a27, v25 +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 +; GFX950-NEXT: v_accvgpr_write_b32 a25, v23 +; GFX950-NEXT: v_accvgpr_write_b32 a24, v22 +; GFX950-NEXT: v_accvgpr_write_b32 a23, v21 +; GFX950-NEXT: v_accvgpr_write_b32 a22, v20 +; GFX950-NEXT: v_accvgpr_write_b32 a21, v19 +; GFX950-NEXT: v_accvgpr_write_b32 a20, v18 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v17 +; GFX950-NEXT: v_accvgpr_write_b32 a18, v16 +; GFX950-NEXT: v_accvgpr_write_b32 a17, v15 +; GFX950-NEXT: v_accvgpr_write_b32 a16, v14 +; GFX950-NEXT: v_accvgpr_write_b32 a15, v13 +; GFX950-NEXT: v_accvgpr_write_b32 a14, v12 +; GFX950-NEXT: v_accvgpr_write_b32 a13, v11 +; GFX950-NEXT: v_accvgpr_write_b32 a12, v10 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v9 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a9, v7 +; GFX950-NEXT: v_accvgpr_write_b32 a8, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: 
v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a34 ; GFX950-NEXT: buffer_wbl2 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a3 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX950-NEXT: 
v_accvgpr_read_b32 v3, a5 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a8 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a9 +; GFX950-NEXT: v_accvgpr_read_b32 v8, a10 +; GFX950-NEXT: v_accvgpr_read_b32 v9, a11 +; GFX950-NEXT: v_accvgpr_read_b32 v10, a12 +; GFX950-NEXT: v_accvgpr_read_b32 v11, a13 +; GFX950-NEXT: v_accvgpr_read_b32 v12, a14 +; GFX950-NEXT: v_accvgpr_read_b32 v13, a15 +; GFX950-NEXT: v_accvgpr_read_b32 v14, a16 +; GFX950-NEXT: v_accvgpr_read_b32 v15, a17 +; GFX950-NEXT: v_accvgpr_read_b32 v16, a18 +; GFX950-NEXT: v_accvgpr_read_b32 v17, a19 +; GFX950-NEXT: v_accvgpr_read_b32 v18, a20 +; GFX950-NEXT: v_accvgpr_read_b32 v19, a21 +; GFX950-NEXT: v_accvgpr_read_b32 v20, a22 +; GFX950-NEXT: v_accvgpr_read_b32 v21, a23 +; GFX950-NEXT: v_accvgpr_read_b32 v22, a24 +; GFX950-NEXT: v_accvgpr_read_b32 v23, a25 +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 +; GFX950-NEXT: v_accvgpr_read_b32 v25, a27 +; GFX950-NEXT: v_accvgpr_read_b32 v26, a28 +; GFX950-NEXT: v_accvgpr_read_b32 v27, a29 +; GFX950-NEXT: v_accvgpr_read_b32 v28, a30 +; GFX950-NEXT: v_accvgpr_read_b32 v29, a31 +; GFX950-NEXT: v_accvgpr_read_b32 v30, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v31, a33 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ; use v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 
offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; 
GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() @@ -4308,39 +4367,37 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB53_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB53_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB53_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; 
GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB53_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -4355,37 +4412,35 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB53_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 -; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB53_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz 
.LBB53_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -4497,38 +4552,36 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB55_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB55_2: ; %Flow ; GFX90A-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB55_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB55_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -4541,37 +4594,35 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB55_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 -; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 ; 
GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB55_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB55_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -4766,38 +4817,36 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB58_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; 
GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB58_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB58_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB58_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -4810,37 +4859,35 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; 
implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB58_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 -; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB58_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB58_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5440,13 +5487,13 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -5469,12 +5516,12 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB69_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6062,13 +6109,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6093,13 +6140,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB85_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6193,12 +6240,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6221,12 +6268,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB87_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6305,48 +6352,45 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_add_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; 
GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB89_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB89_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB89_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen 
offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB89_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_add_i64_ret_a_a: @@ -6354,43 +6398,41 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB89_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: 
$vgpr4_vgpr5 ; GFX950-NEXT: .LBB89_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB89_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -6488,48 +6530,45 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_sub_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: 
; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB91_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB91_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB91_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: 
buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB91_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a: @@ -6537,45 +6576,43 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB91_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB91_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB91_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; 
GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -6675,48 +6712,45 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_and_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; 
GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB93_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB93_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB93_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB93_4: ; 
%atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_and_i64_ret_a_a: @@ -6724,44 +6758,42 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB93_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB93_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB93_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 
0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 -; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v2, v0, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -6869,7 +6901,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB95_4 @@ -6886,8 +6918,6 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, 
s[6:7] @@ -6901,21 +6931,21 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB95_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_and_b32_e32 v3, v1, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v4, v2, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_not_b32_e32 v2, v3 -; GFX90A-NEXT: v_not_b32_e32 v3, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v4, v0, v6 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB95_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -6934,7 +6964,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB95_4 @@ -6951,8 +6981,6 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) 
#0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -6972,13 +7000,13 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, v1, v7 ; GFX950-NEXT: v_and_b32_e32 v5, v0, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_not_b32_e32 v3, v2 ; GFX950-NEXT: v_not_b32_e32 v2, v5 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7118,48 +7146,45 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_or_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB97_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB97_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB97_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_or_b32_e32 v3, v1, v5 +; GFX90A-NEXT: v_or_b32_e32 v4, v0, v4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB97_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND 
-; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_or_i64_ret_a_a: @@ -7167,44 +7192,42 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB97_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB97_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB97_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: 
s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_or_b32_e32 v3, v1, v5 +; GFX950-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -7309,43 +7332,40 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB99_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB99_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; 
GFX90A-NEXT: s_cbranch_execz .LBB99_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB99_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_max_i64_ret_a_a: @@ -7353,46 +7373,44 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; 
GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB99_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB99_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB99_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -7500,43 +7518,40 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB101_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB101_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB101_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: 
s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB101_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_min_i64_ret_a_a: @@ -7544,46 +7559,44 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB101_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB101_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB101_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; 
GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -7691,43 +7704,40 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB103_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB103_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB103_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], 
v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB103_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a: @@ -7735,46 +7745,44 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB103_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] sc0 -; GFX950-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB103_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB103_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -7882,43 +7890,40 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; 
GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB105_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB105_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB105_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, 
v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB105_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a: @@ -7926,46 +7931,44 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB105_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; 
implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB105_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB105_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -8073,45 +8076,42 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; 
GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB107_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB107_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB107_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc -; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, 0, v6, 
vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB107_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a: @@ -8119,46 +8119,45 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB107_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB107_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], 
s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB107_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 -; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB107_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -8263,53 +8262,50 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; 
GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB109_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB109_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB109_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; 
GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB109_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a: @@ -8317,48 +8313,46 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB109_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; 
GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB109_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB109_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 ; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; 
GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -8466,64 +8460,62 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB111_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB111_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[6:7] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], 
v[0:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB111_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB111_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB111_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen 
offset:4 ; GFX90A-NEXT: .LBB111_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -8542,7 +8534,7 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB111_4 @@ -8562,8 +8554,6 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -8585,13 +8575,14 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB111_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -8748,7 +8739,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: 
v_accvgpr_read_b32 v7, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB113_4 @@ -8766,8 +8757,6 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -8789,14 +8778,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -8815,7 +8804,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; 
implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB113_4 @@ -8835,8 +8824,6 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -8858,13 +8845,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -9022,55 +9010,53 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: ; implicit-def: $vgpr2 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB115_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; 
GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: ; implicit-def: $vgpr2 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB115_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc -; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: global_atomic_add_f32 v2, v[0:1], v3, off glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr3 ; GFX90A-NEXT: .LBB115_3: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB115_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v3 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: .LBB115_5: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: ; implicit-def: $vgpr3 ; GFX90A-NEXT: .LBB115_6: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB115_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ds_add_rtn_f32 v2, v0, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: .LBB115_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a: @@ -9189,12 +9175,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9217,12 +9203,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9312,13 +9298,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9342,13 +9328,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9442,13 +9428,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9472,13 +9458,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: 
;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9573,13 +9559,13 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9602,12 +9588,12 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9701,13 +9687,13 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9730,12 +9716,12 @@ define void 
@flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9817,68 +9803,63 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fadd_f64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB127_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB127_3 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off glc ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB127_3: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB127_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB127_5: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB127_6: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB127_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; 
GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a: @@ -9886,65 +9867,61 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB127_6 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB127_3 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB127_3: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB127_5 ; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB127_5: ; %Flow1 ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB127_6: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB127_8 ; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; 
GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -10089,7 +10066,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB129_4 @@ -10103,9 +10080,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_2 @@ -10121,15 +10096,14 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB129_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -10148,7 +10122,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB129_4 @@ -10162,9 +10136,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB129_2 @@ -10181,12 +10153,12 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -10313,49 +10285,46 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_f64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB131_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB131_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB131_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 
v6, -1, v2, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB131_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a: @@ -10363,45 +10332,43 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB131_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_max_f64 
v[0:1], v[0:1], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB131_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB131_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -10502,49 +10469,46 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_f64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB133_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB133_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB133_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB133_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a: @@ -10552,45 +10516,43 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB133_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: 
; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB133_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB133_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -10700,7 +10662,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB135_4 @@ -10718,8 +10680,6 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -10737,18 +10697,17 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -10767,7 +10726,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB135_4 @@ -10786,8 +10745,6 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 
v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -10808,13 +10765,14 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB135_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -10968,7 +10926,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB137_4 @@ -10986,8 +10944,6 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ 
-11005,18 +10961,17 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -11035,7 +10990,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB137_4 @@ -11054,8 +11009,6 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] 
@@ -11076,13 +11029,14 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB137_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11245,12 +11199,12 @@ define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB139_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11342,12 +11296,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11370,12 +11324,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { ; 
GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB141_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11465,13 +11419,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11496,13 +11450,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB143_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11597,13 +11551,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11628,13 +11582,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB145_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11734,13 +11688,13 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11763,12 +11717,12 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB147_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11870,13 +11824,13 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11899,12 +11853,12 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB149_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12021,13 +11975,13 @@ define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12154,13 +12108,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12188,13 +12142,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB153_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12324,13 +12278,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12358,13 +12312,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB155_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12494,13 +12448,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12528,13 +12482,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; 
GFX950-NEXT: s_cbranch_execnz .LBB157_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12669,13 +12623,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12703,13 +12657,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB159_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12849,13 +12803,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: 
s_cbranch_execnz .LBB161_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12883,13 +12837,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB161_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13328,13 +13282,13 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13358,12 +13312,12 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB171_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14089,13 +14043,13 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14121,13 +14075,13 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB189_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14224,12 +14178,12 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB191_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14253,12 +14207,12 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB191_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14355,28 +14309,26 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB193_3 ; GFX90A-NEXT: s_branch .LBB193_4 ; GFX90A-NEXT: .LBB193_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword a0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword a1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 
offen offset:4 ; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a: @@ -14398,25 +14350,23 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX950-NEXT: s_cbranch_execz .LBB193_3 ; GFX950-NEXT: s_branch .LBB193_4 ; GFX950-NEXT: .LBB193_2: -; GFX950-NEXT: ; implicit-def: $agpr2_agpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB193_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 a[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 off, a[0:1], s0 ; GFX950-NEXT: .LBB193_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use a[2:3] +; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -14523,32 +14473,28 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB195_3 ; GFX90A-NEXT: s_branch .LBB195_4 ; GFX90A-NEXT: .LBB195_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB195_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB195_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_a_a: @@ -14570,27 +14516,24 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB195_3 ; GFX950-NEXT: s_branch .LBB195_4 ; GFX950-NEXT: .LBB195_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB195_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -14700,32 +14643,28 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB197_3 ; GFX90A-NEXT: s_branch .LBB197_4 ; GFX90A-NEXT: .LBB197_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB197_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen 
offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB197_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_a_a: @@ -14739,37 +14678,34 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB197_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: 
v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB197_3 ; GFX950-NEXT: s_branch .LBB197_4 ; GFX950-NEXT: .LBB197_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB197_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -14881,32 +14817,28 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB199_3 ; GFX90A-NEXT: s_branch .LBB199_4 ; GFX90A-NEXT: .LBB199_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB199_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 
s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 ; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB199_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_a_a: @@ -14920,36 +14852,33 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB199_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB199_3 ; GFX950-NEXT: s_branch .LBB199_4 ; GFX950-NEXT: .LBB199_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB199_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX950-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -15070,8 +14999,6 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -15080,25 +15007,25 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, 
s[6:7] ; GFX90A-NEXT: s_branch .LBB201_6 ; GFX90A-NEXT: .LBB201_4: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB201_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v4, v2, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_not_b32_e32 v2, v3 -; GFX90A-NEXT: v_not_b32_e32 v3, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -15134,8 +15061,6 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, 
s[2:3] @@ -15144,7 +15069,7 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB201_6 ; GFX950-NEXT: .LBB201_4: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB201_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -15153,12 +15078,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, v1, v5 ; GFX950-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_not_b32_e32 v3, v2 ; GFX950-NEXT: v_not_b32_e32 v2, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -15310,32 +15235,28 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB203_3 ; GFX90A-NEXT: s_branch .LBB203_4 ; GFX90A-NEXT: .LBB203_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB203_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 ; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB203_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_a_a: @@ -15349,36 +15270,33 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB203_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB203_3 ; GFX950-NEXT: s_branch .LBB203_4 ; GFX950-NEXT: .LBB203_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: 
$vgpr0_vgpr1 ; GFX950-NEXT: .LBB203_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX950-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -15489,32 +15407,28 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB205_3 ; GFX90A-NEXT: s_branch .LBB205_4 ; GFX90A-NEXT: .LBB205_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB205_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v0, v4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 ; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1 -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB205_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_a_a: @@ -15528,36 +15442,33 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB205_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB205_3 ; GFX950-NEXT: s_branch .LBB205_4 ; GFX950-NEXT: .LBB205_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: 
; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB205_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1 -; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -15668,33 +15579,29 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB207_3 ; GFX90A-NEXT: s_branch .LBB207_4 ; GFX90A-NEXT: .LBB207_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB207_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 
a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB207_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_a_a: @@ -15708,38 +15615,35 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB207_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB207_3 ; GFX950-NEXT: s_branch .LBB207_4 ; GFX950-NEXT: .LBB207_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB207_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: 
s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -15853,33 +15757,29 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB209_3 ; GFX90A-NEXT: s_branch .LBB209_4 ; GFX90A-NEXT: .LBB209_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB209_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; 
GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB209_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_a_a: @@ -15893,38 +15793,35 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB209_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB209_3 ; GFX950-NEXT: s_branch .LBB209_4 ; GFX950-NEXT: .LBB209_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB209_3: ; 
%atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16038,33 +15935,29 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB211_3 ; GFX90A-NEXT: s_branch .LBB211_4 ; GFX90A-NEXT: .LBB211_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB211_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; 
GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB211_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_a_a: @@ -16078,38 +15971,35 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB211_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB211_3 ; GFX950-NEXT: s_branch .LBB211_4 ; GFX950-NEXT: .LBB211_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; 
GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB211_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16223,33 +16113,29 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB213_3 ; GFX90A-NEXT: s_branch .LBB213_4 ; GFX90A-NEXT: .LBB213_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB213_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: 
v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB213_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_a_a: @@ -16263,38 +16149,35 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB213_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB213_3 ; GFX950-NEXT: s_branch .LBB213_4 
; GFX950-NEXT: .LBB213_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB213_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16400,43 +16283,39 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB215_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB215_3 ; GFX90A-NEXT: s_branch .LBB215_4 ; GFX90A-NEXT: .LBB215_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB215_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc -; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB215_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use 
a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a: @@ -16450,38 +16329,36 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB215_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB215_3 ; GFX950-NEXT: s_branch .LBB215_4 ; GFX950-NEXT: .LBB215_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB215_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, 1 -; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, 
vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB215_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16598,37 +16475,33 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB217_3 ; GFX90A-NEXT: s_branch .LBB217_4 ; GFX90A-NEXT: .LBB217_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB217_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] ; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen 
offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a: @@ -16642,40 +16515,37 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB217_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB217_3 ; GFX950-NEXT: s_branch .LBB217_4 ; GFX950-NEXT: .LBB217_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB217_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s2, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s2 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] -; GFX950-NEXT: 
v_lshl_add_u64 v[4:5], v[2:3], 0, -1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 ; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16806,8 +16676,6 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -16816,7 +16684,7 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB219_6 ; GFX90A-NEXT: .LBB219_4: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB219_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -16829,13 +16697,13 @@ 
define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc ; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB219_6: ; %atomicrmw.phi +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -16874,8 +16742,6 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -16884,7 +16750,7 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB219_6 ; GFX950-NEXT: .LBB219_4: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB219_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -16895,12 +16761,13 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB219_6: ; %atomicrmw.phi +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -17071,8 +16938,6 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -17081,7 +16946,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB221_6 ; GFX90A-NEXT: .LBB221_4: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB221_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -17094,13 +16959,13 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword 
v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -17139,8 +17004,6 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -17149,7 +17012,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB221_6 ; GFX950-NEXT: .LBB221_4: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB221_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -17160,12 +17023,13 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -17333,38 +17197,36 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: v_mov_b32_e32 
v1, 0 ; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_cbranch_execz .LBB223_5 ; GFX90A-NEXT: s_branch .LBB223_6 ; GFX90A-NEXT: .LBB223_3: -; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_branch .LBB223_7 ; GFX90A-NEXT: .LBB223_4: -; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX90A-NEXT: v_add_f32_e32 v3, v1, v0 +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GFX90A-NEXT: .LBB223_6: ; %Flow1 ; GFX90A-NEXT: s_cbranch_execnz .LBB223_8 ; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0 +; GFX90A-NEXT: ds_add_rtn_f32 v1, v1, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a: @@ -17485,12 +17347,12 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17514,12 +17376,12 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17597,29 +17459,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: 
v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17629,29 +17491,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; 
GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17735,29 +17597,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17767,29 +17629,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17873,29 +17735,30 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17919,12 +17782,12 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: 
;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -18005,29 +17868,30 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -18051,12 +17915,12 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; 
GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -18161,27 +18025,21 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB235_5 ; GFX90A-NEXT: s_branch .LBB235_6 ; GFX90A-NEXT: .LBB235_3: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_branch .LBB235_7 ; GFX90A-NEXT: .LBB235_4: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB235_6: ; %Flow1 @@ -18189,16 +18047,17 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1] +; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: .LBB235_8: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a: @@ -18225,40 +18084,36 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB235_5 ; GFX950-NEXT: s_branch .LBB235_6 ; GFX950-NEXT: .LBB235_3: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_branch .LBB235_7 ; GFX950-NEXT: .LBB235_4: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; GFX950-NEXT: .LBB235_6: ; %Flow1 ; GFX950-NEXT: s_cbranch_execnz .LBB235_8 ; GFX950-NEXT: .LBB235_7: ; %atomicrmw.shared ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: 
v_mov_b32_e32 v2, s0 -; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1] +; GFX950-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: .LBB235_8: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -18408,9 +18263,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_2 @@ -18418,7 +18271,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB237_6 ; GFX90A-NEXT: .LBB237_4: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB237_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -18426,14 +18279,13 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB237_6: ; %atomicrmw.phi +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -18466,9 +18318,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_2 @@ -18476,18 +18326,18 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB237_6 ; GFX950-NEXT: .LBB237_4: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB237_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -18618,41 +18468,37 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; def a[0:1] 
; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB239_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB239_3 ; GFX90A-NEXT: s_branch .LBB239_4 ; GFX90A-NEXT: .LBB239_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB239_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, 
s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB239_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a: @@ -18674,13 +18520,10 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB239_3 ; GFX950-NEXT: s_branch .LBB239_4 ; GFX950-NEXT: .LBB239_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB239_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 @@ -18688,15 +18531,15 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -18801,41 +18644,37 @@ 
define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB241_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB241_3 ; GFX90A-NEXT: s_branch .LBB241_4 ; GFX90A-NEXT: .LBB241_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB241_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], 
v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB241_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a: @@ -18857,13 +18696,10 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB241_3 ; GFX950-NEXT: s_branch .LBB241_4 ; GFX950-NEXT: .LBB241_2: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB241_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 @@ -18871,15 +18707,15 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x 
double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -19003,8 +18839,6 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -19013,7 +18847,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB243_6 ; GFX90A-NEXT: .LBB243_4: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB243_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -19022,17 +18856,16 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -19070,8 +18903,6 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -19080,7 +18911,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB243_6 ; GFX950-NEXT: .LBB243_4: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB243_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -19090,12 +18921,13 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB243_6: ; %atomicrmw.phi +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -19263,8 +19095,6 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 
a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -19273,7 +19103,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB245_6 ; GFX90A-NEXT: .LBB245_4: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB245_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -19282,17 +19112,16 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -19330,8 +19159,6 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; 
GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -19340,7 +19167,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB245_6 ; GFX950-NEXT: .LBB245_4: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB245_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -19350,12 +19177,13 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB245_6: ; %atomicrmw.phi +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -19515,12 +19343,12 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB247_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19618,12 +19446,12 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19647,12 +19475,12 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB249_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -19730,29 +19558,29 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19762,30 +19590,30 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB251_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -19870,29 +19698,29 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19902,30 +19730,30 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB253_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20031,13 +19859,13 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20061,12 +19889,12 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB255_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20172,13 +20000,13 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20202,12 +20030,12 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; 
GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB257_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20328,13 +20156,13 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20469,13 +20297,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20505,13 +20333,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, 
s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB261_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20647,13 +20475,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20683,13 +20511,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB263_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20825,13 +20653,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 
s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20861,13 +20689,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB265_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -21008,13 +20836,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -21044,13 +20872,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; 
GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB267_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -21196,13 +21024,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -21232,13 +21060,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB269_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll index 063feec759efa..37a44d8b4b7d1 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll 
@@ -449,13 +449,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] +; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -483,13 +483,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] +; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -515,8 +515,8 @@ define void @global_atomic_cmpxchg_i64_ret_v_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:5] ; CHECK-NEXT: ;;#ASMEND @@ -545,8 +545,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_v__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART 
; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND @@ -661,8 +661,8 @@ define void @global_atomic_cmpxchg_i64_ret_av_a__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:5] ; CHECK-NEXT: ;;#ASMEND @@ -691,8 +691,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index c98fff96d7b8a..c54421ae64528 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -338,225 +338,264 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 
a11, v56 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a2 -; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: 
v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21 
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ; use v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword 
v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse 
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: 
scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a2 +; GFX950-NEXT: ; def a34 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_write_b32 a33, v31 +; GFX950-NEXT: v_accvgpr_write_b32 a32, v30 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v29 +; GFX950-NEXT: v_accvgpr_write_b32 a30, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a29, v27 +; GFX950-NEXT: v_accvgpr_write_b32 a28, v26 +; GFX950-NEXT: v_accvgpr_write_b32 a27, v25 +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 +; GFX950-NEXT: v_accvgpr_write_b32 a25, v23 +; GFX950-NEXT: v_accvgpr_write_b32 a24, v22 +; GFX950-NEXT: v_accvgpr_write_b32 a23, v21 +; GFX950-NEXT: v_accvgpr_write_b32 a22, v20 +; GFX950-NEXT: v_accvgpr_write_b32 a21, v19 +; GFX950-NEXT: v_accvgpr_write_b32 a20, v18 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v17 +; GFX950-NEXT: v_accvgpr_write_b32 a18, v16 +; GFX950-NEXT: v_accvgpr_write_b32 a17, v15 +; GFX950-NEXT: v_accvgpr_write_b32 a16, v14 +; GFX950-NEXT: v_accvgpr_write_b32 a15, v13 +; GFX950-NEXT: v_accvgpr_write_b32 a14, v12 +; GFX950-NEXT: 
v_accvgpr_write_b32 a13, v11 +; GFX950-NEXT: v_accvgpr_write_b32 a12, v10 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v9 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a9, v7 +; GFX950-NEXT: v_accvgpr_write_b32 a8, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a34 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX950-NEXT: scratch_load_dwordx3 
v[16:18], off, s32 offset:64 ; 12-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a3 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a8 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a9 +; GFX950-NEXT: v_accvgpr_read_b32 v8, a10 +; GFX950-NEXT: v_accvgpr_read_b32 v9, a11 +; GFX950-NEXT: v_accvgpr_read_b32 v10, a12 +; GFX950-NEXT: v_accvgpr_read_b32 v11, a13 +; GFX950-NEXT: v_accvgpr_read_b32 v12, a14 +; GFX950-NEXT: v_accvgpr_read_b32 v13, a15 +; GFX950-NEXT: v_accvgpr_read_b32 v14, a16 +; GFX950-NEXT: v_accvgpr_read_b32 v15, a17 +; GFX950-NEXT: v_accvgpr_read_b32 v16, a18 +; GFX950-NEXT: v_accvgpr_read_b32 v17, a19 +; GFX950-NEXT: v_accvgpr_read_b32 v18, a20 +; GFX950-NEXT: v_accvgpr_read_b32 v19, a21 +; GFX950-NEXT: v_accvgpr_read_b32 v20, a22 +; GFX950-NEXT: v_accvgpr_read_b32 v21, a23 +; GFX950-NEXT: v_accvgpr_read_b32 v22, a24 +; GFX950-NEXT: v_accvgpr_read_b32 v23, a25 +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 +; GFX950-NEXT: v_accvgpr_read_b32 v25, a27 +; GFX950-NEXT: v_accvgpr_read_b32 v26, a28 +; GFX950-NEXT: v_accvgpr_read_b32 v27, a29 +; GFX950-NEXT: v_accvgpr_read_b32 v28, a30 +; GFX950-NEXT: v_accvgpr_read_b32 v29, a31 +; GFX950-NEXT: v_accvgpr_read_b32 v30, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v31, a33 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ; use v[0:31] ; 
GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload 
Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() @@ -1062,12 +1101,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1092,12 +1131,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: 
s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB21_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1197,12 +1236,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1226,12 +1265,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB23_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1394,12 +1433,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1423,12 +1462,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB26_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -2007,14 +2046,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2040,14 +2079,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB32_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2151,14 +2190,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2182,14 +2221,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB34_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2356,14 +2395,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; 
GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2387,14 +2426,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB37_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2986,223 +3025,262 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; 
Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill ; 
GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a2 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31 
+; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 
a28, v22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], 
s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 
v24, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ; use v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 
4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 
a11, v56 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill ; 
GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a2 +; GFX950-NEXT: ; def a34 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_write_b32 a33, v31 +; GFX950-NEXT: v_accvgpr_write_b32 a32, v30 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v29 +; GFX950-NEXT: v_accvgpr_write_b32 a30, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a29, v27 +; GFX950-NEXT: v_accvgpr_write_b32 a28, v26 +; GFX950-NEXT: v_accvgpr_write_b32 a27, v25 +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 +; GFX950-NEXT: v_accvgpr_write_b32 a25, v23 +; GFX950-NEXT: v_accvgpr_write_b32 a24, v22 +; GFX950-NEXT: v_accvgpr_write_b32 a23, v21 +; GFX950-NEXT: v_accvgpr_write_b32 a22, v20 +; GFX950-NEXT: v_accvgpr_write_b32 a21, v19 +; GFX950-NEXT: v_accvgpr_write_b32 a20, v18 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v17 +; GFX950-NEXT: v_accvgpr_write_b32 a18, v16 +; GFX950-NEXT: v_accvgpr_write_b32 a17, v15 +; GFX950-NEXT: v_accvgpr_write_b32 a16, v14 +; GFX950-NEXT: v_accvgpr_write_b32 a15, v13 +; GFX950-NEXT: v_accvgpr_write_b32 a14, v12 +; GFX950-NEXT: v_accvgpr_write_b32 a13, v11 +; GFX950-NEXT: v_accvgpr_write_b32 a12, v10 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v9 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a9, v7 +; GFX950-NEXT: 
v_accvgpr_write_b32 a8, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a34 ; GFX950-NEXT: buffer_wbl2 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX950-NEXT: 
v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a3 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a8 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a9 +; GFX950-NEXT: v_accvgpr_read_b32 v8, a10 +; GFX950-NEXT: v_accvgpr_read_b32 v9, a11 +; GFX950-NEXT: v_accvgpr_read_b32 v10, a12 +; GFX950-NEXT: v_accvgpr_read_b32 v11, a13 +; GFX950-NEXT: v_accvgpr_read_b32 v12, a14 +; GFX950-NEXT: v_accvgpr_read_b32 v13, a15 +; GFX950-NEXT: v_accvgpr_read_b32 v14, a16 +; GFX950-NEXT: v_accvgpr_read_b32 v15, a17 +; GFX950-NEXT: v_accvgpr_read_b32 v16, a18 +; GFX950-NEXT: v_accvgpr_read_b32 v17, a19 +; GFX950-NEXT: v_accvgpr_read_b32 v18, a20 +; GFX950-NEXT: v_accvgpr_read_b32 v19, a21 +; GFX950-NEXT: v_accvgpr_read_b32 v20, a22 +; GFX950-NEXT: v_accvgpr_read_b32 v21, a23 +; GFX950-NEXT: v_accvgpr_read_b32 v22, a24 +; GFX950-NEXT: v_accvgpr_read_b32 v23, a25 +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 +; GFX950-NEXT: v_accvgpr_read_b32 v25, a27 +; GFX950-NEXT: v_accvgpr_read_b32 v26, a28 +; GFX950-NEXT: v_accvgpr_read_b32 v27, a29 +; GFX950-NEXT: v_accvgpr_read_b32 v28, a30 +; GFX950-NEXT: v_accvgpr_read_b32 v29, a31 +; GFX950-NEXT: v_accvgpr_read_b32 v30, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v31, a33 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ; use v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; 
GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload 
Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() @@ -3893,13 +3971,13 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -3922,12 +4000,12 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB69_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -4515,13 +4593,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -4546,13 +4624,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB85_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -4646,12 +4724,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use 
a0 ; GFX90A-NEXT: ;;#ASMEND @@ -4674,12 +4752,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB87_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -5000,14 +5078,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB95_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5033,14 +5111,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB95_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: 
s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5664,14 +5742,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB111_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5700,14 +5778,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB111_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5810,14 +5888,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB113_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5846,14 +5924,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB113_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -6027,12 +6105,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6055,12 +6133,12 @@ define void 
@global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6150,13 +6228,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6180,13 +6258,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6280,13 +6358,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6310,13 +6388,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6411,13 +6489,13 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6440,12 +6518,12 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: 
s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6539,13 +6617,13 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6568,12 +6646,12 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6744,14 +6822,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] 
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -6774,14 +6852,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB129_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7024,14 +7102,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB135_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; 
GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7059,14 +7137,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB135_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7168,14 +7246,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB137_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7203,14 +7281,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB137_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7383,12 +7461,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7411,12 +7489,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB141_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7506,13 +7584,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7537,13 +7615,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB143_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7638,13 +7716,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7669,13 +7747,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB145_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7775,13 +7853,13 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7804,12 +7882,12 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB147_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7911,13 +7989,13 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7940,12 +8018,12 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB149_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8062,13 +8140,13 @@ define void @global_atomic_fadd_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8195,13 +8273,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: 
s_cbranch_execnz .LBB153_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8229,13 +8307,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB153_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8365,13 +8443,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8399,13 +8477,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: 
s_cbranch_execnz .LBB155_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8535,13 +8613,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8569,13 +8647,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB157_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8710,13 +8788,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; 
GFX90A-NEXT: s_cbranch_execnz .LBB159_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8744,13 +8822,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB159_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8890,13 +8968,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB161_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8924,13 +9002,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, 
exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB161_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9353,13 +9431,13 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9383,12 +9461,12 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB171_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -10082,13 +10160,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -10114,13 +10192,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB189_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -10217,12 +10295,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB191_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -10246,12 +10324,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB191_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; 
GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -10662,14 +10740,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB201_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -10696,14 +10774,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB201_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11429,14 +11507,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB219_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -11466,14 +11544,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB219_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11579,14 +11657,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB221_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 
; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -11616,14 +11694,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB221_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11804,12 +11882,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11833,12 +11911,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; 
GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11931,13 +12009,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11962,13 +12040,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12065,13 +12143,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12096,13 +12174,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12200,13 +12278,13 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12230,12 +12308,12 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: 
;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12332,13 +12410,13 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12362,12 +12440,12 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12544,14 +12622,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12575,14 +12653,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12834,14 +12912,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB243_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12870,14 +12948,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 
vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB243_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12982,14 +13060,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB245_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -13018,14 +13096,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB245_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, 
v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -13205,12 +13283,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13234,12 +13312,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB249_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13332,13 +13410,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND 
@@ -13364,13 +13442,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB251_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13468,13 +13546,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13500,13 +13578,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB253_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; 
GFX950-NEXT: ;;#ASMEND @@ -13609,13 +13687,13 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13639,12 +13717,12 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB255_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13749,13 +13827,13 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13779,12 +13857,12 @@ 
define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB257_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13904,13 +13982,13 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14041,13 +14119,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14076,13 +14154,13 @@ define void 
@global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB261_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14215,13 +14293,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14250,13 +14328,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB263_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14389,13 
+14467,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14424,13 +14502,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB265_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14568,13 +14646,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: 
;;#ASMEND @@ -14603,13 +14681,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB267_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14752,13 +14830,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14787,13 +14865,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB269_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use 
a0 ; GFX950-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 9e240238c1066..ebbeab94066d6 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -146,9 +146,9 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v39, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v39 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND @@ -437,9 +437,9 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 { ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v35 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND @@ -1045,9 +1045,9 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v39, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v39 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll index 63b7b70548baf..0c5fd1fc0932a 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll @@ -180,55 +180,63 @@ define amdgpu_kernel void @test_call_empty() #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; 
GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: 
global_store_dwordx4 v[0:1], v[11:14], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; 
GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -313,57 +321,65 @@ define amdgpu_kernel void @test_call_areg4() #0 { ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def a[0:31] +; GFX908-NEXT: ; def a[4:35] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: 
v_accvgpr_read_b32 v27, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: v_accvgpr_read_b32 v0, a32 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a33 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a35 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; 
GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -448,57 +464,65 @@ define amdgpu_kernel void @test_call_areg32() #0 { ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def a[0:31] +; GFX908-NEXT: ; def a[32:63] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: 
v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: v_accvgpr_read_b32 v0, a60 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a61 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a62 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a63 +; GFX908-NEXT: s_nop 1 +; 
GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a56 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a57 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a58 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a59 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a52 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a53 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a54 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a55 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a48 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a49 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a50 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a51 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a44 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a45 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a46 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a47 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a40 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a41 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a42 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a43 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a36 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a37 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a38 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a39 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a32 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a33 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a35 +; 
GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -585,55 +609,63 @@ define amdgpu_kernel void @test_call_areg64() #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 
v[0:1], v[23:26], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: 
v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -718,57 +750,65 @@ define amdgpu_kernel void @test_call_areg31_63() #0 { ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def a[0:31] +; GFX908-NEXT: ; def a[64:95] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; 
GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: v_accvgpr_read_b32 v0, a92 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a93 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a94 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a95 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a88 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a89 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a90 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a91 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a84 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a85 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a86 +; GFX908-NEXT: 
v_accvgpr_read_b32 v3, a87 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a80 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a81 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a82 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a83 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a76 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a77 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a78 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a79 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a72 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a73 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a74 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a75 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a68 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a69 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a70 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a71 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a64 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a65 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a66 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a67 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -849,61 +889,125 @@ define amdgpu_kernel void @test_call_unknown() #0 { ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: 
;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v43, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v42, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v41, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v40, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v47, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v46, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v45, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v44, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v59, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v58, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v57, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v56, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v63, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v62, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v61, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v60, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v75, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v74, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v73, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v72, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v79, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v78, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v77, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v76, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v91, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v90, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v89, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v88, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v95, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v94, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v93, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v92, a28 +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX908-NEXT: s_mov_b32 s32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v95, a0 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v94, a1 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v93, a2 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v92, a3 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v91, a4 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v90, a5 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v89, a6 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v88, a7 ; Reload Reuse +; 
GFX908-NEXT: v_accvgpr_read_b32 v79, a8 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v78, a9 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v77, a10 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v76, a11 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v75, a12 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v74, a13 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v73, a14 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v72, a15 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v61, a18 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v60, a19 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v59, a20 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v58, a21 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v57, a22 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v56, a23 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v47, a24 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v46, a25 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v45, a26 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v44, a27 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v43, a28 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v42, a29 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v41, a30 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v40, a31 ; Reload Reuse ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[92:95], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[88:91], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[76:79], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[72:75], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[60:63], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: 
global_store_dwordx4 v[0:1], v[56:59], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[44:47], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[40:43], off +; GFX908-NEXT: v_mov_b32_e32 v4, v95 +; GFX908-NEXT: v_mov_b32_e32 v5, v94 +; GFX908-NEXT: v_mov_b32_e32 v6, v93 +; GFX908-NEXT: v_mov_b32_e32 v7, v92 +; GFX908-NEXT: v_mov_b32_e32 v8, v91 +; GFX908-NEXT: v_mov_b32_e32 v9, v90 +; GFX908-NEXT: v_mov_b32_e32 v10, v89 +; GFX908-NEXT: v_mov_b32_e32 v11, v88 +; GFX908-NEXT: v_mov_b32_e32 v12, v79 +; GFX908-NEXT: v_mov_b32_e32 v13, v78 +; GFX908-NEXT: v_mov_b32_e32 v14, v77 +; GFX908-NEXT: v_mov_b32_e32 v15, v76 +; GFX908-NEXT: v_mov_b32_e32 v16, v75 +; GFX908-NEXT: v_mov_b32_e32 v17, v74 +; GFX908-NEXT: v_mov_b32_e32 v18, v73 +; GFX908-NEXT: v_mov_b32_e32 v19, v72 +; GFX908-NEXT: v_mov_b32_e32 v20, v63 +; GFX908-NEXT: v_mov_b32_e32 v21, v62 +; GFX908-NEXT: v_mov_b32_e32 v22, v61 +; GFX908-NEXT: v_mov_b32_e32 v23, v60 +; GFX908-NEXT: v_mov_b32_e32 v24, v59 +; GFX908-NEXT: v_mov_b32_e32 v25, v58 +; GFX908-NEXT: v_mov_b32_e32 v26, v57 +; GFX908-NEXT: v_mov_b32_e32 v27, v56 +; GFX908-NEXT: v_mov_b32_e32 v28, v47 +; GFX908-NEXT: v_mov_b32_e32 v29, v46 +; GFX908-NEXT: v_mov_b32_e32 v30, v45 +; GFX908-NEXT: v_mov_b32_e32 v31, v44 +; GFX908-NEXT: v_mov_b32_e32 v32, v43 +; GFX908-NEXT: v_mov_b32_e32 v33, v42 +; GFX908-NEXT: v_mov_b32_e32 v34, v41 +; GFX908-NEXT: v_mov_b32_e32 v35, v40 +; GFX908-NEXT: v_mov_b32_e32 v0, v32 +; GFX908-NEXT: v_mov_b32_e32 v1, v33 +; GFX908-NEXT: v_mov_b32_e32 v2, v34 +; GFX908-NEXT: v_mov_b32_e32 v3, v35 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, v28 +; GFX908-NEXT: v_mov_b32_e32 v1, v29 +; GFX908-NEXT: v_mov_b32_e32 v2, v30 +; GFX908-NEXT: v_mov_b32_e32 v3, v31 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, v24 
+; GFX908-NEXT: v_mov_b32_e32 v1, v25 +; GFX908-NEXT: v_mov_b32_e32 v2, v26 +; GFX908-NEXT: v_mov_b32_e32 v3, v27 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, v20 +; GFX908-NEXT: v_mov_b32_e32 v1, v21 +; GFX908-NEXT: v_mov_b32_e32 v2, v22 +; GFX908-NEXT: v_mov_b32_e32 v3, v23 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, v16 +; GFX908-NEXT: v_mov_b32_e32 v1, v17 +; GFX908-NEXT: v_mov_b32_e32 v2, v18 +; GFX908-NEXT: v_mov_b32_e32 v3, v19 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, v12 +; GFX908-NEXT: v_mov_b32_e32 v1, v13 +; GFX908-NEXT: v_mov_b32_e32 v2, v14 +; GFX908-NEXT: v_mov_b32_e32 v3, v15 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, v8 +; GFX908-NEXT: v_mov_b32_e32 v1, v9 +; GFX908-NEXT: v_mov_b32_e32 v2, v10 +; GFX908-NEXT: v_mov_b32_e32 v3, v11 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v6 +; GFX908-NEXT: v_mov_b32_e32 v3, v7 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll index 1a2dd6e5f90f6..1180fc7b35a0b 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll @@ -29,17 +29,17 @@ define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32 ; GFX908-LABEL: remat_regcopy_avoids_spill: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX908-NEXT: 
v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v7 -; GFX908-NEXT: v_accvgpr_write_b32 a1, v8 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v8 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v6 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index c3b14e8829042..2cbf39e2464bc 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -423,7 +423,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: buffer_atomic_add_f32 v6, v4, s[4:7], 0 offen offset:1024 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -431,7 +432,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -541,7 
+542,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: buffer_atomic_add_f32 v6, v4, s[8:11], 0 offen offset:1024 glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -549,7 +551,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,8 +2443,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 @@ -2456,6 +2458,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 @@ -2607,8 +2610,8 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -2620,6 +2623,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 @@ -4485,7 +4489,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4499,6 +4502,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_4 @@ -4774,7 +4778,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 
; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -4786,6 +4789,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 @@ -6348,7 +6352,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6361,6 +6364,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 @@ -6674,7 +6678,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: 
v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6686,6 +6689,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 @@ -7528,7 +7532,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v6, v4, s[4:7], 0 offen offset:1024 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -7536,7 +7541,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -7682,7 +7687,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v6, v4, s[8:11], 0 offen offset:1024 glc ; 
GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -7690,7 +7696,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9975,7 +9981,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -9988,6 +9993,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_4 @@ -10301,7 +10307,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -10313,6 +10318,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index f7a1fb35c8106..187c8c9c11fa3 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -429,7 +429,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 ; GFX942-NEXT: v_max_f32_e32 v6, v4, v9 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -443,6 +442,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_4 @@ -549,7 +549,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 ; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -561,6 +560,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 
; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 @@ -1653,8 +1653,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 @@ -1668,6 +1668,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 @@ -1783,8 +1784,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -1796,6 +1797,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; 
GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 @@ -3603,7 +3605,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3617,6 +3618,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_4 @@ -3902,7 +3904,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3914,6 +3915,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 
exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 @@ -5484,7 +5486,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -5497,6 +5498,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_4 @@ -5810,7 +5812,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -5822,6 +5823,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 @@ -6876,7 +6878,6 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6889,6 +6890,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 @@ -7068,7 +7070,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7080,6 +7081,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 @@ -8665,7 +8667,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; 
GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8678,6 +8679,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_4 @@ -8991,7 +8993,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9003,6 +9004,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 8ac6353133e72..acbea3921b616 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -429,7 +429,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 ; GFX942-NEXT: v_min_f32_e32 v6, v4, v9 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], 
v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -443,6 +442,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_4 @@ -549,7 +549,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 ; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -561,6 +560,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 @@ -1653,8 +1653,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop 
Header: Depth=1 @@ -1668,6 +1668,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 @@ -1783,8 +1784,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -1796,6 +1797,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 @@ -3603,7 +3605,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3617,6 +3618,7 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_4 @@ -3902,7 +3904,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3914,6 +3915,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 @@ -5484,7 +5486,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -5497,6 +5498,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_4 @@ -5810,7 +5812,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -5822,6 +5823,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 @@ -6876,7 +6878,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6889,6 +6890,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; 
GFX942-NEXT: s_cbranch_execnz .LBB18_4 @@ -7068,7 +7070,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7080,6 +7081,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 @@ -8665,7 +8667,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8678,6 +8679,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_4 @@ -8991,7 +8993,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec 
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9003,6 +9004,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 3c991cfb7a1aa..0199e2866b35d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -258,68 +258,59 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 -; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v0, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v0, s[4:7], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v0, s[4:7], 0 offen offset:64 +; 
SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v0, s[4:7], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v0, s[4:7], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v0, s[4:7], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v0, s[4:7], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v0, s[4:7], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v0, s[4:7], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v0, s[4:7], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v0, s[4:7], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16 ; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 ; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, 
s[4:7], 0 offen offset:160 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224 -; SDAG-GFX942-NEXT: s_nop 0 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_nop 0 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16 -; SDAG-GFX942-NEXT: s_nop 1 -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80 
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 
offen offset:240 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm @@ -440,46 +431,58 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v2, s0, v1 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[4:7], 
v2, s[8:11], 0 offen +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v2, s[8:11], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v2, s[8:11], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v2, s[8:11], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v2, s[8:11], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v2, s[8:11], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v2, s[8:11], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v2, s[8:11], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v2, s[8:11], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v2, s[8:11], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v2, s[8:11], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v2, s[8:11], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v2, s[8:11], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v2, s[8:11], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v2, s[8:11], 0 offen offset:224 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v2, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: v_add_u32_e32 v2, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 
v[22:25], v63, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v2, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v2, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v2, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v2, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v2, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v2, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v2, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v2, s[4:7], 0 offen 
offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v2, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v2, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v2, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v2, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v2, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v2, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v2, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v2, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; GISEL-GFX942-NEXT: s_endpgm @@ -820,30 +823,41 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0 +; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s8, v0 ; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 ; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse -; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 
0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224 -; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[12:15], 0 offen +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; 
SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm @@ -977,32 +991,43 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0 +; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s12, v0 ; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 ; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], 
vcc, -1 ; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse -; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16 +; 
GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; GISEL-GFX942-NEXT: s_endpgm @@ -1146,8 +1171,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX942-NEXT: 
s_mov_b32 s2, s1 ; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX942-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] -; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen ; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54 ; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; SDAG-GFX942-NEXT: s_mov_b32 s5, s12 @@ -1158,12 +1183,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 ; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; SDAG-GFX942-NEXT: v_mov_b32_e32 v5, s0 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v1, s0 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_endpgm ; ; SDAG-GFX1100-LABEL: memcpy_known_small: @@ -1217,8 +1242,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 ; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen +; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen ; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54 ; 
GISEL-GFX942-NEXT: s_mov_b32 s4, s7 @@ -1229,12 +1254,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 ; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s0 ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16 ; GISEL-GFX942-NEXT: s_endpgm ; ; GISEL-GFX1100-LABEL: memcpy_known_small: diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index 683887b0a55f3..8b998354b1f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -426,122 +426,126 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-LABEL: ds_write2_b32_av_av_no_vgprs: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse 
-; GCN-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: v_accvgpr_write_b32 a0, 
v0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def a1 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def a2 +; GCN-NEXT: ; def a34 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a33, v31 +; GCN-NEXT: v_accvgpr_write_b32 
a32, v30 +; GCN-NEXT: v_accvgpr_write_b32 a31, v29 +; GCN-NEXT: v_accvgpr_write_b32 a30, v28 +; GCN-NEXT: v_accvgpr_write_b32 a29, v27 +; GCN-NEXT: v_accvgpr_write_b32 a28, v26 +; GCN-NEXT: v_accvgpr_write_b32 a27, v25 +; GCN-NEXT: v_accvgpr_write_b32 a26, v24 +; GCN-NEXT: v_accvgpr_write_b32 a25, v23 +; GCN-NEXT: v_accvgpr_write_b32 a24, v22 +; GCN-NEXT: v_accvgpr_write_b32 a23, v21 +; GCN-NEXT: v_accvgpr_write_b32 a22, v20 +; GCN-NEXT: v_accvgpr_write_b32 a21, v19 +; GCN-NEXT: v_accvgpr_write_b32 a20, v18 +; GCN-NEXT: v_accvgpr_write_b32 a19, v17 +; GCN-NEXT: v_accvgpr_write_b32 a18, v16 +; GCN-NEXT: v_accvgpr_write_b32 a17, v15 +; GCN-NEXT: v_accvgpr_write_b32 a16, v14 +; GCN-NEXT: v_accvgpr_write_b32 a15, v13 +; GCN-NEXT: v_accvgpr_write_b32 a14, v12 +; GCN-NEXT: v_accvgpr_write_b32 a13, v11 +; GCN-NEXT: v_accvgpr_write_b32 a12, v10 +; GCN-NEXT: v_accvgpr_write_b32 a11, v9 +; GCN-NEXT: v_accvgpr_write_b32 a10, v8 +; GCN-NEXT: v_accvgpr_write_b32 a9, v7 +; GCN-NEXT: v_accvgpr_write_b32 a8, v6 +; GCN-NEXT: v_accvgpr_write_b32 a7, v5 +; GCN-NEXT: v_accvgpr_write_b32 a6, v4 +; GCN-NEXT: v_accvgpr_write_b32 a5, v3 +; GCN-NEXT: v_accvgpr_write_b32 a4, v2 +; GCN-NEXT: v_accvgpr_write_b32 a3, v1 +; GCN-NEXT: v_accvgpr_write_b32 a2, v0 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v2, a34 ; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 -; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GCN-NEXT: 
v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; 
GCN-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v0, a2 +; GCN-NEXT: v_accvgpr_read_b32 v1, a3 +; GCN-NEXT: v_accvgpr_read_b32 v2, a4 +; GCN-NEXT: v_accvgpr_read_b32 v3, a5 +; GCN-NEXT: v_accvgpr_read_b32 v4, a6 +; GCN-NEXT: v_accvgpr_read_b32 v5, a7 +; GCN-NEXT: v_accvgpr_read_b32 v6, a8 +; GCN-NEXT: v_accvgpr_read_b32 v7, a9 +; GCN-NEXT: v_accvgpr_read_b32 v8, a10 +; GCN-NEXT: v_accvgpr_read_b32 v9, a11 +; GCN-NEXT: v_accvgpr_read_b32 v10, a12 +; GCN-NEXT: v_accvgpr_read_b32 v11, a13 +; GCN-NEXT: v_accvgpr_read_b32 v12, a14 +; GCN-NEXT: v_accvgpr_read_b32 v13, a15 +; GCN-NEXT: v_accvgpr_read_b32 v14, a16 +; GCN-NEXT: v_accvgpr_read_b32 v15, a17 +; GCN-NEXT: v_accvgpr_read_b32 v16, a18 +; GCN-NEXT: v_accvgpr_read_b32 v17, a19 +; GCN-NEXT: v_accvgpr_read_b32 v18, a20 +; GCN-NEXT: v_accvgpr_read_b32 v19, a21 +; GCN-NEXT: v_accvgpr_read_b32 v20, a22 +; GCN-NEXT: v_accvgpr_read_b32 v21, a23 +; GCN-NEXT: v_accvgpr_read_b32 v22, a24 +; GCN-NEXT: v_accvgpr_read_b32 v23, a25 +; GCN-NEXT: v_accvgpr_read_b32 v24, a26 +; GCN-NEXT: v_accvgpr_read_b32 v25, a27 +; GCN-NEXT: v_accvgpr_read_b32 v26, a28 +; GCN-NEXT: v_accvgpr_read_b32 v27, a29 +; GCN-NEXT: v_accvgpr_read_b32 v28, a30 +; GCN-NEXT: v_accvgpr_read_b32 v29, a31 +; GCN-NEXT: v_accvgpr_read_b32 v30, a32 +; GCN-NEXT: v_accvgpr_read_b32 v31, a33 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v62, a17 ; 
Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10 %gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24 @@ -976,123 +980,133 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a6, v41 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a7, v42 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a8, v43 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a9, v44 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a10, v45 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a11, v46 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a12, v47 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a13, v56 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a14, v57 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a15, v58 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a16, v59 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a17, v60 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a18, v61 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a19, v62 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a20, v63 ; Reload Reuse +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword a37, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: v_accvgpr_write_b32 a0, v0 ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def a[2:3] +; GCN-NEXT: ; def a[34:35] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def a[4:5] +; GCN-NEXT: ; def a[36:37] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_accvgpr_write_b32 a21, v31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_write_b32 a33, v31 +; GCN-NEXT: v_accvgpr_write_b32 a32, v30 +; GCN-NEXT: v_accvgpr_write_b32 a31, v29 +; GCN-NEXT: v_accvgpr_write_b32 a30, v28 +; GCN-NEXT: 
v_accvgpr_write_b32 a29, v27 +; GCN-NEXT: v_accvgpr_write_b32 a28, v26 +; GCN-NEXT: v_accvgpr_write_b32 a27, v25 +; GCN-NEXT: v_accvgpr_write_b32 a26, v24 +; GCN-NEXT: v_accvgpr_write_b32 a25, v23 +; GCN-NEXT: v_accvgpr_write_b32 a24, v22 +; GCN-NEXT: v_accvgpr_write_b32 a23, v21 +; GCN-NEXT: v_accvgpr_write_b32 a22, v20 +; GCN-NEXT: v_accvgpr_write_b32 a21, v19 +; GCN-NEXT: v_accvgpr_write_b32 a20, v18 +; GCN-NEXT: v_accvgpr_write_b32 a19, v17 +; GCN-NEXT: v_accvgpr_write_b32 a18, v16 +; GCN-NEXT: v_accvgpr_write_b32 a17, v15 +; GCN-NEXT: v_accvgpr_write_b32 a16, v14 +; GCN-NEXT: v_accvgpr_write_b32 a15, v13 +; GCN-NEXT: v_accvgpr_write_b32 a14, v12 +; GCN-NEXT: v_accvgpr_write_b32 a13, v11 +; GCN-NEXT: v_accvgpr_write_b32 a12, v10 +; GCN-NEXT: v_accvgpr_write_b32 a11, v9 +; GCN-NEXT: v_accvgpr_write_b32 a10, v8 +; GCN-NEXT: v_accvgpr_write_b32 a9, v7 +; GCN-NEXT: v_accvgpr_write_b32 a8, v6 +; GCN-NEXT: v_accvgpr_write_b32 a7, v5 +; GCN-NEXT: v_accvgpr_write_b32 a6, v4 +; GCN-NEXT: v_accvgpr_write_b32 a5, v3 +; GCN-NEXT: v_accvgpr_write_b32 a4, v2 +; GCN-NEXT: v_accvgpr_write_b32 a3, v1 +; GCN-NEXT: v_accvgpr_write_b32 a2, v0 +; GCN-NEXT: v_accvgpr_read_b32 v2, a34 +; GCN-NEXT: v_accvgpr_read_b32 v4, a36 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v3, a35 +; GCN-NEXT: v_accvgpr_read_b32 v5, a37 ; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 -; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a28, v24 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a27, v25 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a26, v26 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a25, v27 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload 
Reuse -; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: v_accvgpr_read_b32 v21, a31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v22, a30 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v23, a29 
; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v24, a28 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v25, a27 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v26, a26 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v27, a25 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v28, a24 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v29, a23 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v30, a22 ; Reload Reuse -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_accvgpr_read_b32 v31, a21 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v0, a2 +; GCN-NEXT: v_accvgpr_read_b32 v1, a3 +; GCN-NEXT: v_accvgpr_read_b32 v2, a4 +; GCN-NEXT: v_accvgpr_read_b32 v3, a5 +; GCN-NEXT: v_accvgpr_read_b32 v4, a6 +; GCN-NEXT: v_accvgpr_read_b32 v5, a7 +; GCN-NEXT: v_accvgpr_read_b32 v6, a8 +; GCN-NEXT: v_accvgpr_read_b32 v7, a9 +; GCN-NEXT: v_accvgpr_read_b32 v8, a10 +; GCN-NEXT: v_accvgpr_read_b32 v9, a11 +; GCN-NEXT: v_accvgpr_read_b32 v10, a12 +; GCN-NEXT: v_accvgpr_read_b32 v11, a13 +; GCN-NEXT: v_accvgpr_read_b32 v12, a14 +; GCN-NEXT: v_accvgpr_read_b32 v13, a15 +; GCN-NEXT: v_accvgpr_read_b32 v14, a16 +; GCN-NEXT: v_accvgpr_read_b32 v15, a17 +; GCN-NEXT: v_accvgpr_read_b32 v16, a18 +; GCN-NEXT: v_accvgpr_read_b32 v17, a19 +; GCN-NEXT: v_accvgpr_read_b32 v18, a20 +; GCN-NEXT: v_accvgpr_read_b32 v19, a21 +; GCN-NEXT: v_accvgpr_read_b32 v20, a22 +; GCN-NEXT: v_accvgpr_read_b32 v21, a23 +; GCN-NEXT: v_accvgpr_read_b32 v22, a24 +; GCN-NEXT: v_accvgpr_read_b32 v23, a25 +; GCN-NEXT: v_accvgpr_read_b32 v24, a26 +; GCN-NEXT: v_accvgpr_read_b32 v25, a27 +; GCN-NEXT: v_accvgpr_read_b32 v26, a28 +; GCN-NEXT: v_accvgpr_read_b32 v27, a29 +; GCN-NEXT: v_accvgpr_read_b32 v28, a30 +; GCN-NEXT: v_accvgpr_read_b32 v29, a31 +; GCN-NEXT: v_accvgpr_read_b32 v30, a32 +; GCN-NEXT: v_accvgpr_read_b32 v31, a33 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_accvgpr_read_b32 v63, a20 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v62, a19 ; Reload Reuse -; GCN-NEXT: 
v_accvgpr_read_b32 v61, a18 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v60, a17 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v59, a16 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v58, a15 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v57, a14 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v56, a13 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v47, a12 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v46, a11 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v45, a10 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v44, a9 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v43, a8 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v42, a7 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v41, a6 ; Reload Reuse +; GCN-NEXT: buffer_load_dword a37, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword a36, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GCN-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %lds, i32 0, i32 10 %gep.1 = getelementptr inbounds [512 x i64], ptr addrspace(3) %lds, i32 0, i32 24 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 1e7855ccb3642..af817c3ee4eb1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -1012,7 +1012,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB12_2 ; GFX950-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private @@ -1045,7 +1044,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX950-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private @@ -1169,7 
+1167,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private @@ -1206,7 +1203,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll index 57bfd2490f9da..d973f7b71fb6d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll @@ -19,11 +19,11 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; GFX90A-LABEL: half8: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX90A-NEXT: global_load_dwordx4 v[2:5], v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX1030-LABEL: half8: @@ -85,11 +85,11 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; GFX90A-LABEL: half6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] +; GFX90A-NEXT: global_load_dwordx3 v[2:4], v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] +; GFX90A-NEXT: global_store_dwordx3 v0, v[2:4], s[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX1030-LABEL: half6: diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll index 597f90c0f4e84..554d4f69ea4a2 100644 --- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll @@ -43,8 +43,7 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { } ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal VGPR to SGPR copy -; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1 -; GCN: ; illegal copy [[COPY1]] to s9 +; GCN: ; illegal copy a1 to s9 define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { %agpr = call i32 asm sideeffect "; def $0", "=${a1}"() call void asm sideeffect "; use $0", "${s9}"(i32 %agpr) @@ -52,9 +51,7 @@ define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { } ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal VGPR to SGPR copy -; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0 -; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1 -; GCN: ; illegal copy v[[[COPY1L]]:[[COPY1H]]] to s[10:11] +; GCN: ; illegal copy a[0:1] to s[10:11] define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 { %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"() call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll index b91963f08681c..364d2f52777d3 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll +++ 
b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -49,10 +49,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_barrier -; GFX90A-NEXT: ds_read_b32 v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ds_read_b32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: barrier_release: @@ -72,10 +72,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_barrier ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-LABEL: barrier_release: @@ -94,10 +94,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_barrier -; GFX942-NEXT: ds_read_b32 v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: ds_read_b32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: barrier_release: @@ -117,10 +117,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_barrier ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 -; 
GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX10WGP-LABEL: barrier_release: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll index 3e96dfe40f745..a57b43a81205b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll @@ -37,11 +37,11 @@ entry: define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX950-SDAG-LABEL: ds_read_b96_tr_b6: ; GFX950-SDAG: ; %bb.0: ; %entry -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32 +; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[4:6], v0 offset:32 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off +; GFX950-SDAG-NEXT: global_store_dwordx3 v[2:3], v[4:6], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: ds_read_b96_tr_b6: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 7959cee49b93f..fb32a83f3cf3c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -294,17 +294,17 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b32 v1, v1 +; GCN-NEXT: ds_read_b32 v2, v1 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; 
GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_write_b32 v0, v1 +; GCN-NEXT: ds_write_b32 v0, v2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ds_read_b32 v0, v2 offset:256 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: ds_read_b32 v1, v1 offset:256 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_write_b32 v1, v0 offset:256 +; GCN-NEXT: ds_write_b32 v0, v1 offset:256 ; GCN-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll index 49607e320bd0a..efd5df85280e6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll @@ -39,9 +39,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -70,14 +68,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ; def a[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf 
unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() @@ -92,8 +86,7 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 -; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm glc ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -106,8 +99,7 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm glc ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -123,9 +115,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 % ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -139,9 +129,7 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_swap 
v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm glc ; GFX90A-NEXT: s_endpgm %data = call i64 asm "; def $0", "=a"() %unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -154,14 +142,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ; def a[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll index 12a998ad82cd2..92a5f88246888 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll @@ -89,59 +89,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v2, 
a30 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: 
v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm @@ -255,25 +255,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 9 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a7 ; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; GFX908-NEXT: 
global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; GFX908-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX908-NEXT: s_endpgm @@ -422,22 +422,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a3 ; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], 
s[16:17] offset:48 ; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll index 87a7c2ef6c95c..c21d86684e445 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll @@ -8,10 +8,10 @@ define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg ; HEURRC-LABEL: default: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 @@ -34,10 +34,10 @@ define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> ; HEURRC-LABEL: request_agpr: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 5ab8706f28f5f..22bc62acce15d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -726,12 +726,12 @@ define amdgpu_kernel void 
@test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0 +; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0 ; GFX90A-VGPR-NEXT: s_nop 3 -; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-VGPR-NEXT: s_nop 7 ; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -742,12 +742,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0 +; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -765,10 +765,10 @@ 
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 ; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s11 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v1, s11 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 @@ -779,7 +779,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 @@ -792,10 +792,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: v_mov_b32_e32 v0, s10 ; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v1, s11 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 @@ -806,7 +806,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; 
GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 @@ -819,17 +819,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s10 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s10 ; GFX90A-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s11 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s11 +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 @@ -842,17 +842,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s10 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s10 ; GFX942-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v11, s11 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[12:13] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s11 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 @@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; 
GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] 
op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index dc4c929124fec..bc4822ef32a3d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -1445,20 +1445,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1467,38 +1467,38 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-SDAG-NEXT: 
v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1507,18 +1507,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16: ; GFX942-AGPRCD: ; %bb.0: ; %bb @@ -1577,11 +1577,11 @@ define amdgpu_kernel void 
@test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -1606,11 +1606,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s24 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1621,7 +1621,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -1635,11 +1635,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1650,7 +1650,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], 
v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -1664,11 +1664,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s24 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1679,7 +1679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -1847,20 +1847,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-SDAG-NEXT: v_mov_b32_e32 
v6, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1869,38 +1869,38 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; 
GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX950-SDAG-NEXT: s_endpgm ; ; 
GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1909,18 +1909,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16: ; GFX942-AGPRCD: ; %bb.0: ; %bb @@ -1979,11 +1979,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-SDAG-NEXT: v_mov_b64_e32 
v[18:19], s[22:23] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -2008,11 +2008,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s24 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -2023,7 +2023,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 
v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -2037,11 +2037,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -2052,7 +2052,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -2066,11 +2066,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-GISEL-NEXT: 
s_load_dword s24, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s24 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -2081,7 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -2275,21 +2275,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: @@ -2322,21 +2322,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: 
v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -2495,15 +2495,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2512,7 +2512,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, 
v[0:3], s[24:25] @@ -2560,15 +2560,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2577,7 +2577,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -2789,21 +2789,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], 
s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: @@ -2836,21 +2836,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -3000,21 +3000,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: 
v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: @@ -3047,21 +3047,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -3211,21 +3211,21 @@ define amdgpu_kernel void 
@test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: @@ -3258,21 +3258,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -3422,21 +3422,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[6:7], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: @@ -3469,21 +3469,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 
v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -3642,15 +3642,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3659,7 +3659,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -3707,15 +3707,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], 
s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3724,7 +3724,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -3945,15 +3945,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; 
GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3962,7 +3962,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4010,15 +4010,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: 
v_mov_b64_e32 v[20:21], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4027,7 +4027,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4248,15 +4248,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4265,7 +4265,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4313,15 +4313,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4330,7 +4330,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: 
global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4551,15 +4551,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4568,7 +4568,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4616,15 +4616,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4633,7 +4633,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 033a35f69a0bd..68e3afe8b449a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -15,15 +15,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[8:9], 48 -; GCN-NEXT: v_mov_b64_e32 v[10:11], 32 -; GCN-NEXT: v_mov_b64_e32 v[12:13], 16 +; GCN-NEXT: v_mov_b64_e32 v[0:1], 48 +; GCN-NEXT: 
v_mov_b64_e32 v[2:3], 32 +; GCN-NEXT: v_mov_b64_e32 v[4:5], 16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -41,40 +41,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] ; GCN-NEXT: v_mov_b32_e32 v18, s18 ; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NEXT: v_mov_b64_e32 v[14:15], 0 +; GCN-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_mov_b64_e32 v[6:7], 0 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 
v[6:7], a[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -88,15 +87,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[8:9], 48 -; GCN-NEXT: v_mov_b64_e32 v[10:11], 32 -; GCN-NEXT: v_mov_b64_e32 v[12:13], 16 +; GCN-NEXT: v_mov_b64_e32 v[0:1], 48 +; GCN-NEXT: v_mov_b64_e32 v[2:3], 32 +; GCN-NEXT: v_mov_b64_e32 v[4:5], 16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], 
s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -114,40 +113,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v18, s18 ; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NEXT: v_mov_b64_e32 v[14:15], 0 +; GCN-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_mov_b64_e32 v[6:7], 0 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1) @@ -160,22 +158,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; 
GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 11 @@ -204,22 +202,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 
blgp:1 ; GCN-NEXT: s_nop 11 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 753206206180a..03bf33e0d17e6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -12,29 +12,45 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, ; -------------------------------------------------------------------- define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_16x16x32_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_f32_16x16x32_f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_f32_16x16x32_f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; 
GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 @@ -74,29 +90,45 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg } define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_f32_16x16x32_f16__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 
+; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_f32_16x16x32_f16__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 7 @@ -382,15 +414,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 ; 
SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -408,40 +440,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 ; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off 
sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -449,15 +480,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; 
GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -473,34 +504,33 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], 
s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -508,15 +538,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48 -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32 -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -534,40 +564,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: v_mov_b32_e32 v16, s16 ; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] ; HEURRC-NEXT: v_mov_b32_e32 v18, s18 ; 
HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, s20 +; HEURRC-NEXT: v_mov_b32_e32 v9, s21 +; HEURRC-NEXT: v_mov_b32_e32 v10, s22 +; HEURRC-NEXT: v_mov_b32_e32 v11, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0 ; HEURRC-NEXT: s_nop 4 -; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s8 ; HEURRC-NEXT: v_mov_b32_e32 v1, s9 ; HEURRC-NEXT: v_mov_b32_e32 v2, s10 ; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s12 ; HEURRC-NEXT: v_mov_b32_e32 v1, s13 ; HEURRC-NEXT: v_mov_b32_e32 v2, s14 ; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: 
global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -575,15 +604,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48 -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32 -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -593,40 +622,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v48, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v49, s17 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15] ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 -; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 ; VGPRRC-NEXT: s_nop 8 -; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: 
s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 -; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16: @@ -765,15 +794,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; 
SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -791,40 +820,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 ; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; 
SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -832,15 +860,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], 
s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -856,34 +884,33 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 
+; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -891,15 +918,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48 -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32 -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -917,40 +944,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> 
%arg0, < ; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: v_mov_b32_e32 v16, s16 ; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 ; HEURRC-NEXT: v_mov_b32_e32 v18, s18 ; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, s20 +; HEURRC-NEXT: v_mov_b32_e32 v9, s21 +; HEURRC-NEXT: v_mov_b32_e32 v10, s22 +; HEURRC-NEXT: v_mov_b32_e32 v11, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0 ; HEURRC-NEXT: s_nop 4 -; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s8 ; HEURRC-NEXT: v_mov_b32_e32 v1, s9 ; HEURRC-NEXT: v_mov_b32_e32 v2, s10 ; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; 
HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s12 ; HEURRC-NEXT: v_mov_b32_e32 v1, s13 ; HEURRC-NEXT: v_mov_b32_e32 v2, s14 ; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -958,15 +984,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48 -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32 -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -976,40 +1002,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v48, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v49, s17 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], 
v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1 ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 -; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 ; VGPRRC-NEXT: s_nop 8 -; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 -; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: 
global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags: @@ -1144,65 +1170,105 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < } define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac: +; SDAG: ; 
%bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 
a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 ; HEURRC-NEXT: 
v_accvgpr_write_b32 a15, v23 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; HEURRC-NEXT: s_nop 11 @@ -1314,65 +1380,105 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half } define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; 
GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: 
v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: 
; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 11 @@ -2536,29 +2642,45 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg) define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) { -; GCN-LABEL: test_mfma_i32_16x16x64_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_i32_16x16x64_i8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_i32_16x16x64_i8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 @@ -2598,29 +2720,45 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 } define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) { -; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_i32_16x16x64_i8__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_i32_16x16x64_i8__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: 
v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 7 @@ -3035,15 +3173,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -3059,34 +3197,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> 
%arg0, <4 x i32> ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], 
off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -3447,15 +3584,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -3471,34 +3608,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; 
GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -3784,65 +3920,105 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 } define <16 x i32> 
@test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) { -; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: 
v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: 
v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; 
HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; HEURRC-NEXT: s_nop 11 @@ -3954,65 +4130,105 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar } define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) { -; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; 
GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: 
+; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, 
v13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 11 @@ -5299,10 +5515,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; GCN-LABEL: test_mfma_f32_16x16x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] ; GCN-NEXT: s_nop 7 @@ -5315,10 +5531,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; HEURRC-LABEL: 
test_mfma_f32_16x16x32_bf16: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 @@ -5361,10 +5577,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 @@ -5377,10 +5593,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll index d24f1f0b526c3..c1946630ef5f1 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll @@ -52,27 +52,26 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a3 +; 
GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 +; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 +; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_32x32x8i8: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 7e30af96bb8b9..3d9ebf91e8f47 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -99,59 +99,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; NOLIT-SRCC-NEXT: s_nop 0 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm @@ -234,59 +234,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, 
a7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm @@ -510,25 +510,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm 
@@ -577,25 +577,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm @@ -864,22 +864,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; 
NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -931,22 +931,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -1257,59 +1257,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm @@ -1396,59 +1396,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 
v0, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 ; 
LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 ; LIT-SRCC-NEXT: s_endpgm @@ -1690,25 +1690,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 
v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm @@ -1760,25 +1760,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] 
offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm @@ -2080,22 +2080,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -2150,22 +2150,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; 
LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -2425,7 +2425,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -2482,7 +2482,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 @@ -2491,7 +2491,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 @@ -2500,53 +2500,67 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: 
global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 -; NOLIT-SRCC-NEXT: 
global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -2603,7 +2617,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 -; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 @@ -2612,7 +2626,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 @@ -2621,85 +2635,99 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 -; 
LIT-SRCC-NEXT: s_endpgm -; -; GFX90A-LABEL: test_mfma_i32_32x32x4i8: -; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 -; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; LIT-SRCC-NEXT: s_nop 0 +; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_i32_32x32x4i8: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2 +; 
GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 @@ -2843,134 +2871,134 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; 
NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: 
v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 -; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 -; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 -; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 -; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 -; 
LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 -; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 -; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_16x16x4i8: @@ -3095,30 +3123,37 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 
v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64: @@ -3126,30 +3161,33 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 -; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; LIT-SRCC-NEXT: 
s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64: @@ -3594,59 +3632,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; 
NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm @@ -3730,59 +3768,59 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm @@ -4011,22 +4049,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -4078,22 +4116,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -4440,32 +4478,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -4478,28 +4516,28 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm @@ -4584,32 +4622,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -4621,33 +4659,31 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, 0x40004000 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, v2 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[0:3], s[0:1] offset:48 -; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[8:11], s[0:1] offset:16 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a0 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[9:12], s[0:1] +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x8f16_imm_splat: @@ -4751,60 +4787,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; 
NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -4814,55 +4850,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 
v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:112 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:96 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:80 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:112 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:96 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:80 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a18 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:48 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 +; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v13, a17 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:64 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:64 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm_splat: @@ -5055,32 +5091,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -5109,32 +5145,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 
a[0:15] ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm @@ -5277,60 +5313,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -5376,60 +5412,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm @@ -5880,40 +5916,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; 
NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 @@ -5975,40 +6011,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 -; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v18, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index aae14c8cc87b3..52dcfb735a899 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ 
-14,21 +14,37 @@ ; fp8 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 
v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -37,21 +53,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: +; 
GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -60,21 +92,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; 
SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -83,21 +131,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; 
GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -106,21 +170,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -129,21 +209,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: 
v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -152,21 +248,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: 
s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -175,21 +287,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -199,21 +327,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -223,21 +367,37 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; fp8 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] 
blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp @@ -247,21 +407,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp @@ -271,21 +447,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; fp8 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: 
v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -295,21 +487,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 -; 
GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -319,21 +527,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; fp8 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = 
call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp @@ -343,21 +567,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; 
GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp @@ -367,21 +607,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; fp8 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: 
v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -391,21 +647,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -415,21 +687,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; bf8 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 
a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp @@ -439,21 +727,37 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 +; 
GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp @@ -463,21 +767,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; bf8 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp @@ -488,21 +808,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; 
SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp @@ -512,21 +848,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons ; bf8 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] 
cbsz:1 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp @@ -535,21 +887,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> 
%arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call 
<4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp @@ -559,21 +927,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; bf8 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; 
GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp @@ -583,21 +967,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 
blgp:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp @@ -607,21 +1007,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; bf8 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; 
SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp @@ -631,21 +1047,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp @@ -655,21 +1087,37 @@ 
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; fp6 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, 
v19 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp @@ -679,21 +1127,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 
s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp @@ -703,21 +1167,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; fp6 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, 
v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp @@ -727,21 +1207,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], 
v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp @@ -751,21 +1247,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; fp6 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 
%scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 
v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -775,45 +1287,77 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } ; fp6 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: 
v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp @@ -823,21 +1367,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; 
GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp @@ -848,21 +1408,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons ; bf6 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = 
call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp @@ -872,21 +1448,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp @@ -896,21 +1488,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons ; bf6 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; SDAG-NEXT: s_nop 11 +; 
SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp @@ -920,21 +1528,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; 
SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp @@ -944,21 +1568,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; bf6 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; 
GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp @@ -968,21 +1608,37 @@ define <4 x 
float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], 
a[0:3] cbsz:3 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp @@ -992,45 +1648,77 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons ; bf6 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, - i32 3, ; cbsz - i32 4, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -; This should be optimized to avoid the scale -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: 
v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 
%scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp @@ -1040,21 +1728,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons ; bf6 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 
%scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; 
GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp @@ -1064,21 +1768,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: 
v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp @@ -1088,21 +1808,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons ; fp6 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], 
v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp @@ -1112,21 +1848,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: 
v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp @@ -1136,21 +1888,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons ; fp4 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz 
i32 0, ; blgp @@ -1160,21 +1928,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: 
v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -1184,21 +1968,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; fp4 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: 
v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp @@ -1208,21 +2008,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp @@ -1232,21 +2048,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; fp4 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp @@ -1256,21 +2088,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x ; This should be optimized to avoid the scale define 
<4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp @@ -1280,21 +2128,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons ; fp4 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp @@ -1304,21 +2168,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: 
v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp @@ -1328,21 +2208,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons ; fp4 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -1352,21 +2248,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -1379,97 +2291,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons ; -------------------------------------------------------------------- define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: v_mov_b32_e32 v17, s1 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: s_nop 1 -; 
GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v14, s0 -; SDAG-NEXT: v_mov_b32_e32 v15, s1 -; SDAG-NEXT: v_mov_b32_e32 v16, s2 -; SDAG-NEXT: v_mov_b32_e32 v17, s3 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 -; SDAG-NEXT: v_mov_b32_e32 v20, s18 -; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v6, s20 -; SDAG-NEXT: v_mov_b32_e32 v7, s21 -; SDAG-NEXT: v_mov_b32_e32 v8, s22 -; SDAG-NEXT: v_mov_b32_e32 v9, s23 -; SDAG-NEXT: v_mov_b32_e32 v10, s24 -; SDAG-NEXT: v_mov_b32_e32 v11, s25 -; SDAG-NEXT: v_mov_b32_e32 v12, s26 -; SDAG-NEXT: v_mov_b32_e32 v13, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v4 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v5 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1477,29 +2309,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; 
GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v20, s28 -; GISEL-NEXT: v_mov_b32_e32 v21, s29 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; GISEL-NEXT: v_mov_b32_e32 v17, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1510,22 +2330,162 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ret <4 x float> %result } -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v14, s0 -; SDAG-NEXT: v_mov_b32_e32 v15, s1 -; SDAG-NEXT: v_mov_b32_e32 v16, s2 -; 
SDAG-NEXT: v_mov_b32_e32 v17, s3 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: 
v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v12, s0 +; SDAG-NEXT: v_mov_b32_e32 v13, s1 +; SDAG-NEXT: v_mov_b32_e32 v14, s2 +; SDAG-NEXT: v_mov_b32_e32 v15, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, 
s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_mov_b32_e32 v8, s24 +; SDAG-NEXT: v_mov_b32_e32 v9, s25 +; SDAG-NEXT: v_mov_b32_e32 v10, s26 +; SDAG-NEXT: v_mov_b32_e32 v11, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s28 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s29 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s28 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s29 +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; 
GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] @@ -1576,10 +2536,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] 
@@ -1622,6 +2582,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v14, s0 ; SDAG-NEXT: v_mov_b32_e32 v15, s1 ; SDAG-NEXT: v_mov_b32_e32 v16, s2 @@ -1630,10 +2594,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] @@ -1652,13 +2612,13 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] @@ -1751,14 +2711,14 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v20, -2 -; SDAG-NEXT: v_mov_b32_e32 v21, 33 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, -2 +; SDAG-NEXT: v_mov_b32_e32 v17, 33 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1791,14 +2751,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v20, -2 -; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, -2 +; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1831,14 +2791,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d -; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2185,58 +3145,328 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 
s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } ; This should be optimized to avoid the scale, with non-0 op_sel arguments. 
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) ret <4 x float> %result } -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, 1 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; 
GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 1 +; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: 
s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + ret <4 x float> %result +} + +; -------------------------------------------------------------------- +; Incorrect signature for format cases (IR vector too large) +; -------------------------------------------------------------------- + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x 
float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; 
SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v20, 1 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: 
v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2244,39 +3474,38 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v17, 1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 ; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 
%scale1) ret <4 x float> %result } -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 1 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2284,162 +3513,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 1 -; GISEL-NEXT: v_mov_b32_e32 v17, 0 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 ; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) - ret <4 x float> %result -} - -; -------------------------------------------------------------------- -; Incorrect signature for format cases (IR vector too large) -; -------------------------------------------------------------------- - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 0, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x 
i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 2, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, 
i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 
x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -2448,21 +3536,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -2471,21 +3575,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 
4, ; cbsz i32 0, ; blgp @@ -2494,21 +3614,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -2517,21 +3653,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, 
a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index f0205a3a788ed..7b7865e3434db 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -17,27 +17,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, 
v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -61,11 +61,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: 
v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -81,7 +81,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -112,27 +112,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: 
v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -156,11 +156,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -176,7 +176,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; 
GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -207,27 +207,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -251,11 +251,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -271,7 +271,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -302,27 +302,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: 
v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -346,11 +346,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; 
GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -366,7 +366,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -397,27 +397,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: 
scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -441,11 +441,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -461,7 +461,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: 
s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -492,27 +492,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, 
v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -536,11 +536,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -556,7 +556,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -587,27 +587,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -631,11 +631,11 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -651,7 +651,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -682,27 +682,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, 
v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -726,11 +726,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; 
GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -746,7 +746,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -775,47 +775,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; This should be optimized to avoid the scale define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: 
s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: 
v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: 
v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -828,27 +870,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: 
v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:1 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -872,11 +914,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -892,7 +934,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:1 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -920,47 +962,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; 
GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 
a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; 
GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp @@ -970,325 +1054,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; fp8 x fp6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: 
v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 
a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 2, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp8 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; 
GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 3, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: -; GCN: ; 
%bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 3, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp8 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x 
i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; bf8 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x 
i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1309,30 +1097,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, 
v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1352,88 +1139,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 0, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; 
GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 0, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; bf8 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1454,30 +1188,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1497,1992 +1228,2311 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; 
GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 1, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 1, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; bf8 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: 
v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: 
v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz + i32 0, ; cbsz i32 2, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: 
v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; fp8 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; 
SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 
v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz + i32 0, ; cbsz i32 3, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; 
GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: 
v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x 
i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz + i32 0, ; cbsz i32 3, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; fp8 x fp4 +define 
<16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz + i32 0, ; cbsz i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: 
v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: 
v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz + i32 0, ; cbsz i32 4, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; 
GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz +; bf8 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; 
SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; 
GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; 
GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 0, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 
a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; 
GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: 
v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz +; bf8 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: 
scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz i32 1, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 1, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp6 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 
a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 3, ; blgp +; bf8 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 3, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; 
GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } - -; bf6 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, 
v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 0, ; blgp +; bf8 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, 
v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, 
v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 0, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: 
v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: 
v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf6 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: 
v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 1, ; blgp +; bf8 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; 
SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 
a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; 
GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 1, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; bf6 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: 
v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 2, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 
a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; 
GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf6 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; 
GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 4, ; blgp +; fp6 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], 
v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: 
v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 4, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; 
SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf6 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 3, ; blgp +; fp6 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 
blgp:1 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: 
v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 3, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: 
v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: 
v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; 
GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, +; fp6 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: 
v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; 
GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz - i32 4, ; blgp + i32 2, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: 
v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 4, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp4 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, 
a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; 
GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 0, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: 
v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x 
i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp4 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 1, ; blgp +; fp6 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 1, ; blgp +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, 
v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp4 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: -; GCN: ; %bb.0: -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 2, ; blgp + +; bf6 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: 
v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 2, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: 
v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: 
v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp4 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: 
v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 3, ; blgp +; bf6 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: 
v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; 
GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; 
GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 3, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp4 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, 
v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 4, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: 
v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 4, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; -------------------------------------------------------------------- -; Different input parameter classes -; -------------------------------------------------------------------- - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { -; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: v_mov_b32_e32 v17, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; 
GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v30, s16 -; SDAG-NEXT: v_mov_b32_e32 v31, s17 -; SDAG-NEXT: v_mov_b32_e32 v32, s18 -; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v18, s20 -; SDAG-NEXT: v_mov_b32_e32 v19, s21 -; SDAG-NEXT: v_mov_b32_e32 v20, s22 -; SDAG-NEXT: v_mov_b32_e32 v21, s23 -; SDAG-NEXT: v_mov_b32_e32 v22, s24 -; SDAG-NEXT: v_mov_b32_e32 v23, s25 -; SDAG-NEXT: v_mov_b32_e32 v24, s26 -; SDAG-NEXT: v_mov_b32_e32 v25, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v3 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v4 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v5 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v6 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v7 -; 
SDAG-NEXT: v_accvgpr_write_b32 a10, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -3503,41 +3553,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: v_mov_b32_e32 
v32, s28 -; GISEL-NEXT: v_mov_b32_e32 v33, s29 -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v33 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v2 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v3 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v4 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v5 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v6 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v7 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -3557,43 +3593,37 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; bf6 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v30, s16 -; SDAG-NEXT: v_mov_b32_e32 v31, s17 -; SDAG-NEXT: v_mov_b32_e32 v32, s18 -; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: 
v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3612,38 +3642,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] -; 
GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3661,43 +3681,36 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v30, s16 -; SDAG-NEXT: v_mov_b32_e32 v31, s17 -; SDAG-NEXT: v_mov_b32_e32 v32, s18 -; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; 
SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3716,38 +3729,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] -; 
GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3765,43 +3768,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: 
v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; bf6 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v30, s16 -; SDAG-NEXT: v_mov_b32_e32 v31, s17 -; SDAG-NEXT: v_mov_b32_e32 v32, s18 -; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: 
v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3820,38 +3817,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; 
GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3869,98 +3856,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: 
v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s16 -; GCN-NEXT: v_accvgpr_write_b32 a5, s17 -; GCN-NEXT: v_accvgpr_write_b32 a6, s18 -; GCN-NEXT: v_accvgpr_write_b32 a7, s19 -; GCN-NEXT: v_accvgpr_write_b32 a8, s20 -; GCN-NEXT: v_accvgpr_write_b32 a9, s21 -; GCN-NEXT: v_accvgpr_write_b32 a10, s22 -; GCN-NEXT: v_accvgpr_write_b32 a11, s23 -; GCN-NEXT: v_accvgpr_write_b32 a12, s24 -; GCN-NEXT: v_accvgpr_write_b32 a13, s25 -; GCN-NEXT: v_accvgpr_write_b32 a14, s26 -; GCN-NEXT: v_accvgpr_write_b32 a15, s27 -; GCN-NEXT: v_mov_b32_e32 v17, s28 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, 
a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v30, s16 -; SDAG-NEXT: v_mov_b32_e32 v31, s17 -; SDAG-NEXT: v_mov_b32_e32 v32, s18 -; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 -; SDAG-NEXT: v_mov_b32_e32 v24, s28 -; SDAG-NEXT: v_mov_b32_e32 v25, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 
a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3979,47 +3904,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v24, s20 -; GISEL-NEXT: v_mov_b32_e32 v25, s21 -; GISEL-NEXT: v_mov_b32_e32 v26, s22 -; GISEL-NEXT: v_mov_b32_e32 v27, s23 -; GISEL-NEXT: v_mov_b32_e32 v28, s24 -; GISEL-NEXT: v_mov_b32_e32 v29, s25 -; GISEL-NEXT: v_mov_b32_e32 v30, s26 -; GISEL-NEXT: v_mov_b32_e32 v31, s27 -; GISEL-NEXT: v_mov_b32_e32 v32, s28 -; GISEL-NEXT: v_mov_b32_e32 v33, s29 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v30 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v31 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v32 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v33 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; 
GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4037,37 +3943,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; bf6 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, -2 -; SDAG-NEXT: v_mov_b32_e32 v32, 33 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: 
v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4086,32 +3992,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 33 -; GISEL-NEXT: v_mov_b32_e32 v32, -2 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: 
v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4129,37 +4031,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, -2 -; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4178,32 +4079,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, -2 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; 
GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4221,37 +4118,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, 
i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: +; fp6 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; 
SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4270,32 +4167,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; 
GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4313,37 +4206,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, -2 -; SDAG-NEXT: v_mov_b32_e32 v32, 1.0 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; 
SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4362,32 +4254,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v32, -2 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: 
v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4405,35 +4293,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; fp4 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 -; 
SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -4454,30 +4343,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494 -; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] 
cbsz:4 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -4497,35 +4383,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d -; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; 
SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -4546,30 +4432,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: 
v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -4589,224 +4472,2337 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) 
ret <16 x float> %result } -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; fp4 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, s18 -; SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: 
v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; SDAG-NEXT: s_endpgm +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: ; GISEL: ; %bb.0: -; 
GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_mov_b32_e32 v32, s0 -; GISEL-NEXT: v_mov_b32_e32 v33, s1 +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 -; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) - store <16 x float> %result, ptr addrspace(1) %ptr, align 64 - ret void +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result } -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v32, -2 -; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, s18 -; SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: 
v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 ; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; SDAG-NEXT: s_endpgm +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v33, -2 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: 
v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 ; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] 
offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) - store <16 x float> %result, ptr addrspace(1) %ptr, align 64 - ret void +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result } -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac: +; fp4 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; 
SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s16 -; SDAG-NEXT: v_mov_b32_e32 v7, s17 -; SDAG-NEXT: v_mov_b32_e32 v8, s18 -; SDAG-NEXT: v_mov_b32_e32 v9, s19 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; SDAG-NEXT: v_mov_b32_e32 v14, s24 -; SDAG-NEXT: v_mov_b32_e32 v15, s25 -; SDAG-NEXT: v_mov_b32_e32 v16, s26 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, 
v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0] -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; 
GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: 
v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: 
v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp4 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], 
v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: 
v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: 
v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, 
a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp4 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 
v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: 
v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, 
a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + 
i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; -------------------------------------------------------------------- +; Different input parameter classes +; -------------------------------------------------------------------- + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: 
v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; GISEL-NEXT: v_mov_b32_e32 v17, s1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, 
a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v17, s0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: 
v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v17, s0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: 
v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v17, s0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: 
v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v17, s0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: 
v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: v_mov_b32_e32 v18, s2 +; SDAG-NEXT: v_mov_b32_e32 v19, s3 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v7 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v6 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v5 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v4 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v3 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s28 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s29 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; 
SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v2 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v3 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v4 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v5 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v6 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 +; GISEL-NEXT: 
v_accvgpr_write_b32 a14, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s28 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s29 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, 
v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v14, s16 +; SDAG-NEXT: v_mov_b32_e32 v15, s17 +; SDAG-NEXT: v_mov_b32_e32 v16, s18 +; SDAG-NEXT: v_mov_b32_e32 v17, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; 
GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v14, s16 +; SDAG-NEXT: v_mov_b32_e32 v15, s17 +; SDAG-NEXT: v_mov_b32_e32 v16, s18 +; SDAG-NEXT: v_mov_b32_e32 v17, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: 
v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; 
GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v14, s16 +; SDAG-NEXT: v_mov_b32_e32 v15, s17 +; SDAG-NEXT: v_mov_b32_e32 v16, s18 +; SDAG-NEXT: v_mov_b32_e32 v17, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[10:17], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; 
SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] +; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[10:17], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_accvgpr_write_b32 a4, s16 +; GCN-NEXT: v_accvgpr_write_b32 a5, s17 +; GCN-NEXT: v_accvgpr_write_b32 a6, s18 +; GCN-NEXT: v_accvgpr_write_b32 a7, s19 +; GCN-NEXT: v_accvgpr_write_b32 a8, s20 +; GCN-NEXT: v_accvgpr_write_b32 a9, s21 +; GCN-NEXT: v_accvgpr_write_b32 a10, s22 +; GCN-NEXT: v_accvgpr_write_b32 a11, s23 +; GCN-NEXT: v_accvgpr_write_b32 a12, s24 +; GCN-NEXT: v_accvgpr_write_b32 a13, s25 +; GCN-NEXT: 
v_accvgpr_write_b32 a14, s26 +; GCN-NEXT: v_accvgpr_write_b32 a15, s27 +; GCN-NEXT: v_mov_b32_e32 v17, s28 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: v_mov_b32_e32 v18, s2 +; SDAG-NEXT: v_mov_b32_e32 v19, s3 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 +; SDAG-NEXT: 
v_accvgpr_write_b32 a11, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s24 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s25 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s26 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s28 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s29 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 +; GISEL-NEXT: 
v_accvgpr_write_b32 a13, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s24 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s25 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s26 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s27 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s28 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, -2 +; SDAG-NEXT: v_mov_b32_e32 v17, 33 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 
a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, 33 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: 
+; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, -2 +; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: 
v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword 
a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, 1.0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, 
<8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, -2 +; SDAG-NEXT: v_mov_b32_e32 v17, 1.0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: +; GISEL: ; %bb.0: +; 
GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0.15915494 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, 0.15915494 +; GISEL-NEXT: v_mov_b32_e32 v17, 1.0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> 
%arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + ret <16 x float> %result +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 
v16, v[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b32_e32 v32, s0 +; GISEL-NEXT: v_mov_b32_e32 v33, s1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) + 
store <16 x float> %result, ptr addrspace(1) %ptr, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v32, -2 +; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 
v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v33, -2 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) + store <16 x float> %result, ptr 
addrspace(1) %ptr, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; SDAG-NEXT: v_mov_b32_e32 v14, s24 +; SDAG-NEXT: v_mov_b32_e32 v15, s25 +; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mov_b32_e32 
v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 ; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 ; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4846,18 +6842,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[44:45] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[50:51] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -4873,36 +6869,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b32_e32 v20, s0 -; GISEL-NEXT: v_mov_b32_e32 v21, s1 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; 
GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b32_e32 v4, s0 +; GISEL-NEXT: v_mov_b32_e32 v5, s1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[6:13], v[14:21], a[0:15], v4, v5 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: global_store_dwordx4 
v[0:1], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[2:3], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) @@ -4996,19 +6992,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v20, 25 -; GISEL-NEXT: v_mov_b32_e32 v21, 42 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b32_e32 v4, 25 +; GISEL-NEXT: v_mov_b32_e32 v5, 42 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[44:45] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 
s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[50:51] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -5024,34 +7020,293 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[6:13], v[14:21], a[0:15], v4, v5 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[2:3], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[20:23], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[2:3], a[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: 
global_store_dwordx4 v[6:7], a[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v44, s24 +; SDAG-NEXT: v_mov_b32_e32 v45, s25 +; SDAG-NEXT: v_mov_b32_e32 v46, s26 +; SDAG-NEXT: v_mov_b32_e32 v47, s27 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; SDAG-NEXT: s_nop 14 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; 
SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GISEL-NEXT: 
v_mov_b64_e32 v[40:41], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr 
addrspace(1) null, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v32, 42 +; SDAG-NEXT: v_mov_b32_e32 v33, 25 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v24, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s21 +; SDAG-NEXT: v_mov_b32_e32 v26, s22 +; SDAG-NEXT: v_mov_b32_e32 v27, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v28, s24 +; SDAG-NEXT: v_mov_b32_e32 v29, s25 +; SDAG-NEXT: v_mov_b32_e32 v30, s26 +; SDAG-NEXT: v_mov_b32_e32 v31, s27 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; 
SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b32_e32 v32, 25 +; GISEL-NEXT: v_mov_b32_e32 v33, 42 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] 
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], 
off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -5060,383 +7315,586 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ret void } -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s12 -; SDAG-NEXT: v_mov_b32_e32 v33, s13 -; SDAG-NEXT: v_mov_b32_e32 v34, s14 -; SDAG-NEXT: v_mov_b32_e32 v35, s15 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 -; SDAG-NEXT: v_mov_b32_e32 v40, s20 -; SDAG-NEXT: v_mov_b32_e32 v41, s21 -; SDAG-NEXT: v_mov_b32_e32 v42, s22 -; SDAG-NEXT: v_mov_b32_e32 v43, s23 -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v44, s24 -; SDAG-NEXT: v_mov_b32_e32 v45, s25 -; SDAG-NEXT: v_mov_b32_e32 v46, s26 -; SDAG-NEXT: v_mov_b32_e32 v47, s27 -; SDAG-NEXT: s_waitcnt 
lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; SDAG-NEXT: s_nop 14 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: 
v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; 
GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], 
a[0:15] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: 
v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, 1 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], 
v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_endpgm +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], 
s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, 
v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mov_b32_e32 v17, 1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) - store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 - store volatile <16 x float> %result, ptr addrspace(1) null, align 64 - ret void +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + ret <16 x float> %result } -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: 
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v32, 42 -; SDAG-NEXT: v_mov_b32_e32 v33, 25 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; SDAG-NEXT: v_mov_b32_e32 v27, s23 -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 
v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_endpgm 
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v32, 25 -; GISEL-NEXT: v_mov_b32_e32 v33, 42 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: 
v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; 
GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mov_b32_e32 v16, 1 +; GISEL-NEXT: v_mov_b32_e32 v17, 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) - store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 - store volatile <16 x float> %result, ptr addrspace(1) null, align 64 - ret void +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) +; -------------------------------------------------------------------- +; Incorrect signature for format cases (IR vector too large) +; 
-------------------------------------------------------------------- + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, 
a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x 
float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; 
GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: 
v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 1 -; SDAG-NEXT: v_mov_b32_e32 v32, 0 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: 
v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5455,14 +7913,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0 -; GISEL-NEXT: v_mov_b32_e32 v32, 1 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -5478,9 +7936,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], 
v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5498,37 +7955,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 0 -; SDAG-NEXT: v_mov_b32_e32 v32, 1 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; 
SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5547,12 +8004,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 1 -; GISEL-NEXT: v_mov_b32_e32 v32, 0 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -5570,9 
+8025,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5590,39 +8044,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; -------------------------------------------------------------------- -; Incorrect signature for format cases (IR vector too large) -; -------------------------------------------------------------------- - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: 
scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:4 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5643,14 +8096,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 
; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -5666,7 +8119,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:4 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5688,36 +8141,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz - i32 2, ; blgp + i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:4 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5738,14 +8191,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -5761,7 +8214,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5782,38 +8235,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz + i32 4, ; cbsz i32 
0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: 
v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5832,31 +8285,31 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: 
v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5874,86 +8327,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: +define 
<16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 +; SDAG-NEXT: scratch_load_dword v14, off, s32 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 +; 
SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5974,30 +8378,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 +; GISEL-NEXT: scratch_load_dword v14, off, s32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 +; GISEL-NEXT: 
v_accvgpr_write_b32 a12, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -6017,40 +8420,39 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6069,14 +8471,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: +; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -6092,9 +8494,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6114,136 +8515,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6263,12 +8562,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -6286,7 +8583,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 ; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6305,54 +8602,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 4, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 
x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 6eb9449069a52..c2b7e51c43bc8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -149,19 +149,19 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, < ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16 +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -247,168 +247,151 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_f16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 
-; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_f16: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_f16: +; GCN: ; 
%bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0: -; GISEL: ; %bb.0: -; 
GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, 
v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1: +; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; 
SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1: +; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: 
v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -425,104 +408,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> 
inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: -; 
GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -779,53 +664,37 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, s0 -; GCN-NEXT: v_mov_b32_e32 v37, s1 -; GCN-NEXT: v_mov_b32_e32 
v38, s2 -; GCN-NEXT: v_mov_b32_e32 v39, s3 -; GCN-NEXT: v_mov_b32_e32 v13, s25 -; GCN-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NEXT: v_mov_b32_e32 v16, s28 -; GCN-NEXT: v_mov_b32_e32 v17, s29 -; GCN-NEXT: v_mov_b32_e32 v28, s16 -; GCN-NEXT: v_mov_b32_e32 v29, s17 -; GCN-NEXT: v_mov_b32_e32 v30, s18 -; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: v_mov_b32_e32 v32, s20 -; GCN-NEXT: v_mov_b32_e32 v33, s21 -; GCN-NEXT: v_mov_b32_e32 v34, s22 -; GCN-NEXT: v_mov_b32_e32 v35, s23 -; GCN-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v27, v9 +; GCN-NEXT: v_mov_b32_e32 v26, s0 +; GCN-NEXT: v_mov_b32_e32 v27, s1 +; GCN-NEXT: v_mov_b32_e32 v28, s2 +; GCN-NEXT: v_mov_b32_e32 v29, s3 +; GCN-NEXT: v_mov_b32_e32 v16, v10 +; GCN-NEXT: v_mov_b32_e32 v15, v9 +; GCN-NEXT: v_mov_b32_e32 v14, v8 +; GCN-NEXT: v_mov_b32_e32 v13, v7 +; GCN-NEXT: v_mov_b32_e32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v11, v5 +; GCN-NEXT: v_mov_b32_e32 v10, v4 +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v7, v1 +; GCN-NEXT: v_mov_b32_e32 v6, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NEXT: v_mov_b32_e32 v4, s28 +; GCN-NEXT: v_mov_b32_e32 v5, s29 +; GCN-NEXT: v_mov_b32_e32 v18, s16 +; GCN-NEXT: v_mov_b32_e32 v19, s17 +; GCN-NEXT: v_mov_b32_e32 v20, s18 +; GCN-NEXT: v_mov_b32_e32 v21, s19 +; GCN-NEXT: v_mov_b32_e32 v22, s20 +; GCN-NEXT: v_mov_b32_e32 v23, s21 +; GCN-NEXT: v_mov_b32_e32 v24, s22 +; GCN-NEXT: v_mov_b32_e32 v25, s23 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], 
v[28:35], v10 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -982,19 +851,19 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16 +; 
GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x i32> %result @@ -1086,44 +955,151 @@ bb: } define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_32x32x64_i8: +; GCN-LABEL: test_smfmac_i32_32x32x64_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) + ret <16 x i32> %result +} + +define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { +; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: 
v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) + ret <16 x i32> %result +} + +define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { +; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x i32> %result +} + +define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_i32_32x32x64_i8: +; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1140,228 +1116,6 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 
x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) - ret <16 x i32> %result -} - -define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: 
v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) - ret <16 x i32> %result -} - -define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: 
v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x i32> %result -} - -define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; 
SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; 
GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x i32> %result @@ -1518,19 +1272,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], 
v[6:13], v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -1687,19 +1441,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -1856,19 +1610,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 
s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -2025,19 +1779,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: 
v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -2129,168 +1883,151 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: 
v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 
v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; 
GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; 
SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 
s[22:23] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2307,104 +2044,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; 
SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, 
v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -2496,168 +2135,151 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 
v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) - ret <16 x float> %result -} +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> 
%arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) + ret <16 x float> %result +} define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; 
GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: +; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 
+; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; 
SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 
+; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2674,104 +2296,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; 
SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 
-; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -2863,168 +2387,151 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: -; GISEL: ; %bb.0: -; GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, 
v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: 
v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 11 +; 
GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: 
v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: 
v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -3041,104 +2548,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: 
v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; 
GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -3230,168 +2639,151 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: 
v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, 
v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: 
v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: v_mov_b32_e32 v4, v16 -; GISEL-NEXT: v_mov_b32_e32 v5, v17 -; GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GISEL-NEXT: v_mov_b32_e32 v7, v19 -; GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GISEL-NEXT: v_mov_b32_e32 v9, v21 -; GISEL-NEXT: v_mov_b32_e32 v10, v22 -; GISEL-NEXT: v_mov_b32_e32 v11, v23 -; GISEL-NEXT: v_mov_b32_e32 v12, v24 -; GISEL-NEXT: v_mov_b32_e32 v13, v25 -; GISEL-NEXT: v_mov_b32_e32 v14, v26 -; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: +; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; 
SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: 
v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v12, s24 +; GISEL-NEXT: v_mov_b32_e32 v13, s25 +; GISEL-NEXT: v_mov_b32_e32 v14, s26 +; GISEL-NEXT: v_mov_b32_e32 v15, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[28:31], v[0:7], v10 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -3408,104 +2800,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: 
v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v12, v1 -; GISEL-NEXT: v_mov_b32_e32 v13, v2 -; GISEL-NEXT: v_mov_b32_e32 v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v15, v4 -; GISEL-NEXT: v_mov_b32_e32 v16, v5 -; GISEL-NEXT: v_mov_b32_e32 v17, v6 -; GISEL-NEXT: v_mov_b32_e32 v18, v7 -; GISEL-NEXT: 
v_mov_b32_e32 v19, v8 -; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] -; GISEL-NEXT: v_mov_b32_e32 v21, v10 -; GISEL-NEXT: v_mov_b32_e32 v0, s24 -; GISEL-NEXT: v_mov_b32_e32 v1, s25 -; GISEL-NEXT: v_mov_b32_e32 v2, s26 -; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 -; GISEL-NEXT: v_mov_b32_e32 v5, s29 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v6, v11 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mov_b32_e32 v8, v13 -; GISEL-NEXT: v_mov_b32_e32 v9, v14 -; GISEL-NEXT: v_mov_b32_e32 v10, v15 -; GISEL-NEXT: v_mov_b32_e32 v11, v16 -; GISEL-NEXT: v_mov_b32_e32 v12, v17 -; GISEL-NEXT: v_mov_b32_e32 v13, v18 -; GISEL-NEXT: v_mov_b32_e32 v14, v19 -; GISEL-NEXT: v_mov_b32_e32 v15, v20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 4366472c73a0e..d3e171be10802 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -246,7 +246,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 -; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: 
s_cbranch_execnz .LBB4_1 @@ -280,7 +279,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX942-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr7 -; GFX942-NEXT: ; implicit-def: $vgpr0 ; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 @@ -420,7 +418,6 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 -; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 @@ -454,7 +451,6 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr7 -; GFX942-NEXT: ; implicit-def: $vgpr0 ; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 0191a85b33888..5b72e006072df 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -193,7 +193,8 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen 
offen glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: buffer_atomic_add_f32 v1, v[8:9], s[8:11], s12 idxen offen glc ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -202,6 +203,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: @@ -227,7 +229,8 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen sc0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: buffer_atomic_add_f32 v1, v[8:9], s[4:7], s8 idxen offen sc0 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr7 ; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -236,6 +239,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: @@ -339,7 +343,8 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v1, v[8:9], s[8:11], s12 idxen offen 
glc ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -348,6 +353,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: @@ -373,7 +379,8 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen sc0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v1, v[8:9], s[4:7], s8 idxen offen sc0 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr7 ; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -382,6 +389,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 9dac2393fd966..1c04ff3e83326 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -85,7 +85,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 
s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt +; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-SDAG-NEXT: s_mov_b32 s5, s12 @@ -96,9 +96,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt +; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: buffer_nontemporal_load_store: @@ -115,7 +115,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt +; GFX942-GISEL-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 @@ -126,9 +126,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt +; GFX942-GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: 
buffer_nontemporal_load_store: @@ -413,7 +413,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1 ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-SDAG-NEXT: s_mov_b32 s5, s12 @@ -424,9 +424,9 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1 ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -443,7 +443,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1 ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 @@ -454,9 +454,9 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-GISEL-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1 ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index 9585c486aeb9e..3c4a29c54928d 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GFX908 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90ADAG,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90AGSEL,GFX90A %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942DAG,GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942GSEL,GFX942 %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) @@ -86,62 +86,254 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GFX908-NEXT: s_nop 15 ; 
GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 ; GFX908-NEXT: s_nop 1 ; 
GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_vgpr: +; GFX90ADAG: ; %bb.0: ; %bb +; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90ADAG-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v32, 0 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX90ADAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: v_mov_b32_e32 v0, s16 +; GFX90ADAG-NEXT: v_mov_b32_e32 v1, s17 +; GFX90ADAG-NEXT: v_mov_b32_e32 v2, s18 +; GFX90ADAG-NEXT: v_mov_b32_e32 v3, s19 +; GFX90ADAG-NEXT: v_mov_b32_e32 v4, s20 +; GFX90ADAG-NEXT: 
v_mov_b32_e32 v5, s21 +; GFX90ADAG-NEXT: v_mov_b32_e32 v6, s22 +; GFX90ADAG-NEXT: v_mov_b32_e32 v7, s23 +; GFX90ADAG-NEXT: v_mov_b32_e32 v8, s24 +; GFX90ADAG-NEXT: v_mov_b32_e32 v9, s25 +; GFX90ADAG-NEXT: v_mov_b32_e32 v10, s26 +; GFX90ADAG-NEXT: v_mov_b32_e32 v11, s27 +; GFX90ADAG-NEXT: v_mov_b32_e32 v12, s28 +; GFX90ADAG-NEXT: v_mov_b32_e32 v13, s29 +; GFX90ADAG-NEXT: v_mov_b32_e32 v14, s30 +; GFX90ADAG-NEXT: v_mov_b32_e32 v15, s31 +; GFX90ADAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX90ADAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX90ADAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX90ADAG-NEXT: v_mov_b32_e32 v20, s4 +; GFX90ADAG-NEXT: v_mov_b32_e32 v21, s5 +; GFX90ADAG-NEXT: v_mov_b32_e32 v22, s6 +; GFX90ADAG-NEXT: v_mov_b32_e32 v23, s7 +; GFX90ADAG-NEXT: v_mov_b32_e32 v24, s8 +; GFX90ADAG-NEXT: v_mov_b32_e32 v25, s9 +; GFX90ADAG-NEXT: v_mov_b32_e32 v26, s10 +; GFX90ADAG-NEXT: v_mov_b32_e32 v27, s11 +; GFX90ADAG-NEXT: v_mov_b32_e32 v28, s12 +; GFX90ADAG-NEXT: v_mov_b32_e32 v29, s13 +; GFX90ADAG-NEXT: v_mov_b32_e32 v30, s14 +; GFX90ADAG-NEXT: v_mov_b32_e32 v31, s15 +; GFX90ADAG-NEXT: s_nop 1 +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; GFX90ADAG-NEXT: s_endpgm +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_vgpr: +; GFX90AGSEL: ; %bb.0: ; %bb +; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 +; GFX90AGSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[22:23], s[22:23], s[22:23] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1] +; GFX90AGSEL-NEXT: v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1] +; GFX90AGSEL-NEXT: s_nop 1 +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] +; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 0 +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: s_nop 1 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[16:19], 
s[34:35] offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX90AGSEL-NEXT: s_endpgm +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_vgpr: +; GFX942DAG: ; %bb.0: ; %bb +; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942DAG-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v32, 0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX942DAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: v_mov_b32_e32 v0, s16 +; GFX942DAG-NEXT: v_mov_b32_e32 v1, s17 +; GFX942DAG-NEXT: v_mov_b32_e32 v2, s18 +; GFX942DAG-NEXT: v_mov_b32_e32 v3, s19 +; GFX942DAG-NEXT: v_mov_b32_e32 v4, s20 +; GFX942DAG-NEXT: v_mov_b32_e32 v5, s21 +; GFX942DAG-NEXT: v_mov_b32_e32 v6, s22 +; GFX942DAG-NEXT: v_mov_b32_e32 v7, s23 +; GFX942DAG-NEXT: v_mov_b32_e32 v8, s24 +; GFX942DAG-NEXT: v_mov_b32_e32 v9, s25 +; GFX942DAG-NEXT: v_mov_b32_e32 v10, s26 +; GFX942DAG-NEXT: v_mov_b32_e32 v11, s27 +; GFX942DAG-NEXT: v_mov_b32_e32 v12, s28 +; GFX942DAG-NEXT: v_mov_b32_e32 v13, s29 +; GFX942DAG-NEXT: v_mov_b32_e32 v14, s30 +; GFX942DAG-NEXT: v_mov_b32_e32 v15, s31 +; GFX942DAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX942DAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX942DAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX942DAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX942DAG-NEXT: v_mov_b32_e32 v20, s4 +; GFX942DAG-NEXT: v_mov_b32_e32 v21, s5 +; GFX942DAG-NEXT: v_mov_b32_e32 v22, s6 +; GFX942DAG-NEXT: v_mov_b32_e32 v23, s7 +; GFX942DAG-NEXT: v_mov_b32_e32 v24, s8 +; GFX942DAG-NEXT: v_mov_b32_e32 v25, s9 +; GFX942DAG-NEXT: v_mov_b32_e32 v26, s10 +; GFX942DAG-NEXT: v_mov_b32_e32 v27, s11 +; GFX942DAG-NEXT: v_mov_b32_e32 v28, s12 +; GFX942DAG-NEXT: v_mov_b32_e32 v29, s13 +; GFX942DAG-NEXT: 
v_mov_b32_e32 v30, s14 +; GFX942DAG-NEXT: v_mov_b32_e32 v31, s15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; GFX942DAG-NEXT: s_endpgm +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_vgpr: +; GFX942GSEL: ; %bb.0: ; %bb +; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 +; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[24:25], s[24:25] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[26:27], s[26:27] +; GFX942GSEL-NEXT: 
v_mov_b64_e32 v[28:29], s[28:29] +; GFX942GSEL-NEXT: v_mov_b64_e32 v[30:31], s[30:31] +; GFX942GSEL-NEXT: s_nop 1 +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] +; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 0 +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: s_nop 0 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX942GSEL-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -228,62 +420,286 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 
v3, a31 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: 
v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_agpr: +; GFX90ADAG: ; %bb.0: ; %bb +; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX90ADAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a19, s3 +; 
GFX90ADAG-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90ADAG-NEXT: s_nop 1 +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90ADAG-NEXT: s_endpgm +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_agpr: +; GFX90AGSEL: ; %bb.0: ; %bb +; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 +; GFX90AGSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a16, s16 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90AGSEL-NEXT: 
v_accvgpr_write_b32 a4, s4 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a17, s17 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a18, s18 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a19, s19 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a20, s20 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a21, s21 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a22, s22 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a23, s23 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a24, s24 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a25, s25 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a26, s26 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a27, s27 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a28, s28 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a29, s29 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a30, s30 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a31, s31 +; GFX90AGSEL-NEXT: s_nop 1 +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: s_nop 1 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; 
GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX90AGSEL-NEXT: s_endpgm +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_agpr: +; GFX942DAG: ; %bb.0: ; %bb +; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX942DAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX942DAG-NEXT: 
v_accvgpr_write_b32 a30, s14 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942DAG-NEXT: s_endpgm +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_agpr: +; GFX942GSEL: ; %bb.0: ; %bb +; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 +; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a16, s16 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942GSEL-NEXT: 
v_accvgpr_write_b32 a14, s14 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a17, s17 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a18, s18 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a19, s19 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a20, s20 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a21, s21 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a22, s22 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a23, s23 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a24, s24 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a25, s25 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a26, s26 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a27, s27 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a28, s28 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a29, s29 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a30, s30 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a31, s31 +; GFX942GSEL-NEXT: s_nop 1 +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: s_nop 0 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX942GSEL-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -347,40 +763,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: 
s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v24, 
a0 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 @@ -389,6 +805,134 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: +; GFX90ADAG: ; %bb.0: ; %bb +; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX90ADAG-NEXT: ;;#ASMSTART +; GFX90ADAG-NEXT: ; def a0 +; GFX90ADAG-NEXT: ;;#ASMEND +; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 
a[0:31], v1, v2, a[0:31] +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90ADAG-NEXT: s_endpgm +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: +; GFX90AGSEL: ; %bb.0: ; %bb +; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90AGSEL-NEXT: ;;#ASMSTART +; GFX90AGSEL-NEXT: ; def a0 +; GFX90AGSEL-NEXT: ;;#ASMEND +; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: s_nop 2 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; 
GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90AGSEL-NEXT: s_endpgm +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: +; GFX942DAG: ; %bb.0: ; %bb +; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942DAG-NEXT: ;;#ASMSTART +; GFX942DAG-NEXT: ; def a0 +; GFX942DAG-NEXT: ;;#ASMEND +; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; 
GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942DAG-NEXT: s_endpgm +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: +; GFX942GSEL: ; %bb.0: ; %bb +; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942GSEL-NEXT: ;;#ASMSTART +; GFX942GSEL-NEXT: ; def a0 +; GFX942GSEL-NEXT: ;;#ASMEND +; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: s_nop 1 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX942GSEL-NEXT: s_endpgm bb: %acc = call i32 asm sideeffect "; def $0", "={a0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -453,40 +997,40 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: 
v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 @@ -495,6 +1039,134 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: +; GFX90ADAG: ; %bb.0: ; %bb +; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX90ADAG-NEXT: ;;#ASMSTART +; GFX90ADAG-NEXT: ; use a[100:131] +; GFX90ADAG-NEXT: ;;#ASMEND +; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX90ADAG-NEXT: global_load_dwordx4 
a[4:7], v0, s[0:1] offset:16 +; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90ADAG-NEXT: s_endpgm +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: +; GFX90AGSEL: ; %bb.0: ; %bb +; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX90AGSEL-NEXT: ;;#ASMSTART +; GFX90AGSEL-NEXT: ; use a[100:131] +; GFX90AGSEL-NEXT: ;;#ASMEND +; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: s_nop 2 +; GFX90AGSEL-NEXT: 
global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90AGSEL-NEXT: s_endpgm +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: +; GFX942DAG: ; %bb.0: ; %bb +; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942DAG-NEXT: ;;#ASMSTART +; GFX942DAG-NEXT: ; use a[100:131] +; GFX942DAG-NEXT: ;;#ASMEND +; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX942DAG-NEXT: 
global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942DAG-NEXT: s_endpgm +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: +; GFX942GSEL: ; %bb.0: ; %bb +; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942GSEL-NEXT: ;;#ASMSTART +; GFX942GSEL-NEXT: ; use a[100:131] +; GFX942GSEL-NEXT: ;;#ASMEND +; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: s_nop 1 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; 
GFX942GSEL-NEXT: s_endpgm bb: call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison) %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -559,40 +1231,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: 
v_accvgpr_read_b32 v19, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 @@ -601,6 +1273,134 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: +; GFX90ADAG: ; %bb.0: ; %bb +; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90ADAG-NEXT: v_mov_b32_e32 v32, 0 +; GFX90ADAG-NEXT: ;;#ASMSTART +; GFX90ADAG-NEXT: ; def v0 +; GFX90ADAG-NEXT: ;;#ASMEND +; GFX90ADAG-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GFX90ADAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX90ADAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX90ADAG-NEXT: global_load_dwordx4 
v[16:19], v32, s[0:1] offset:64 +; GFX90ADAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX90ADAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX90ADAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX90ADAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90ADAG-NEXT: s_endpgm +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: +; GFX90AGSEL: ; %bb.0: ; %bb +; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90AGSEL-NEXT: ;;#ASMSTART +; GFX90AGSEL-NEXT: ; def v0 +; GFX90AGSEL-NEXT: ;;#ASMEND +; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX90AGSEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX90AGSEL-NEXT: global_load_dwordx4 
v[28:31], v32, s[0:1] offset:112 +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: s_nop 2 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90AGSEL-NEXT: s_endpgm +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: +; GFX942DAG: ; %bb.0: ; %bb +; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942DAG-NEXT: v_mov_b32_e32 v32, 0 +; GFX942DAG-NEXT: ;;#ASMSTART +; GFX942DAG-NEXT: ; def v0 +; GFX942DAG-NEXT: ;;#ASMEND +; GFX942DAG-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GFX942DAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX942DAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX942DAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GFX942DAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX942DAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX942DAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX942DAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; 
GFX942DAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX942DAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942DAG-NEXT: s_endpgm +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: +; GFX942GSEL: ; %bb.0: ; %bb +; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942GSEL-NEXT: ;;#ASMSTART +; GFX942GSEL-NEXT: ; def v0 +; GFX942GSEL-NEXT: ;;#ASMEND +; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX942GSEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX942GSEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX942GSEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX942GSEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GFX942GSEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX942GSEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX942GSEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: s_nop 1 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[16:19], 
s[0:1] offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942GSEL-NEXT: s_endpgm bb: %acc = call i32 asm sideeffect "; def $0", "={v0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -687,40 +1487,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: 
v_accvgpr_read_b32 v23, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 ; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112 ; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64 ; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80 @@ -729,6 +1529,205 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35] ; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_call: +; GFX90ADAG: ; %bb.0: ; %bb +; GFX90ADAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90ADAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90ADAG-NEXT: s_mov_b32 s38, -1 +; GFX90ADAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90ADAG-NEXT: s_add_u32 s36, s36, s11 +; GFX90ADAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX90ADAG-NEXT: s_mov_b32 s12, s8 +; GFX90ADAG-NEXT: s_add_u32 s8, s4, 
44 +; GFX90ADAG-NEXT: s_mov_b32 s13, s9 +; GFX90ADAG-NEXT: s_addc_u32 s9, s5, 0 +; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90ADAG-NEXT: s_getpc_b64 s[4:5] +; GFX90ADAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX90ADAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX90ADAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX90ADAG-NEXT: s_mov_b32 s14, s10 +; GFX90ADAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX90ADAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX90ADAG-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX90ADAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90ADAG-NEXT: v_mov_b32_e32 v31, v0 +; GFX90ADAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90ADAG-NEXT: s_mov_b32 s32, 0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v40, 0 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v40, s[34:35] offset:112 +; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v40, s[34:35] offset:96 +; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v40, s[34:35] offset:80 +; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v40, s[34:35] offset:64 +; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v40, s[34:35] offset:48 +; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v40, s[34:35] offset:32 +; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v40, s[34:35] offset:16 +; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v40, s[34:35] +; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) +; GFX90ADAG-NEXT: s_nop 0 +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[24:27], s[34:35] offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[28:31], s[34:35] offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[16:19], s[34:35] offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[20:23], s[34:35] offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v40, 
a[8:11], s[34:35] offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[12:15], s[34:35] offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[0:3], s[34:35] +; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[4:7], s[34:35] offset:16 +; GFX90ADAG-NEXT: s_endpgm +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_call: +; GFX90AGSEL: ; %bb.0: ; %bb +; GFX90AGSEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX90AGSEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX90AGSEL-NEXT: s_mov_b32 s38, -1 +; GFX90AGSEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX90AGSEL-NEXT: s_add_u32 s36, s36, s11 +; GFX90AGSEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX90AGSEL-NEXT: s_mov_b32 s16, s8 +; GFX90AGSEL-NEXT: s_add_u32 s8, s4, 44 +; GFX90AGSEL-NEXT: s_mov_b32 s15, s9 +; GFX90AGSEL-NEXT: s_addc_u32 s9, s5, 0 +; GFX90AGSEL-NEXT: s_mov_b64 s[12:13], s[0:1] +; GFX90AGSEL-NEXT: s_getpc_b64 s[0:1] +; GFX90AGSEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX90AGSEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX90AGSEL-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0 +; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90AGSEL-NEXT: s_mov_b32 s14, s10 +; GFX90AGSEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX90AGSEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX90AGSEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90AGSEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX90AGSEL-NEXT: s_mov_b64 s[4:5], s[12:13] +; GFX90AGSEL-NEXT: s_mov_b32 s12, s16 +; GFX90AGSEL-NEXT: s_mov_b32 s13, s15 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v31, v0 +; GFX90AGSEL-NEXT: s_mov_b32 s32, 0 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[34:35] +; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[34:35] offset:16 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[34:35] offset:32 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[34:35] offset:48 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, 
s[34:35] offset:64 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[34:35] offset:80 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[34:35] offset:96 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[34:35] offset:112 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) +; GFX90AGSEL-NEXT: s_nop 0 +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: s_nop 2 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX90AGSEL-NEXT: s_endpgm +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_call: +; GFX942DAG: ; %bb.0: ; %bb +; GFX942DAG-NEXT: s_mov_b32 s12, s8 +; GFX942DAG-NEXT: s_add_u32 s8, s4, 44 +; GFX942DAG-NEXT: s_mov_b32 s13, s9 +; GFX942DAG-NEXT: s_addc_u32 s9, s5, 0 +; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942DAG-NEXT: s_getpc_b64 s[4:5] +; GFX942DAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX942DAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX942DAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX942DAG-NEXT: s_mov_b32 s14, s10 +; GFX942DAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942DAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX942DAG-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX942DAG-NEXT: v_mov_b32_e32 v31, v0 +; GFX942DAG-NEXT: s_mov_b32 s32, 0 +; GFX942DAG-NEXT: v_mov_b32_e32 v40, 0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: s_swappc_b64 s[30:31], 
s[16:17] +; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v40, s[34:35] offset:112 +; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v40, s[34:35] offset:96 +; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v40, s[34:35] offset:80 +; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v40, s[34:35] offset:64 +; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v40, s[34:35] offset:48 +; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v40, s[34:35] offset:32 +; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v40, s[34:35] offset:16 +; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v40, s[34:35] +; GFX942DAG-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) +; GFX942DAG-NEXT: s_nop 0 +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v40, a[24:27], s[34:35] offset:96 +; GFX942DAG-NEXT: global_store_dwordx4 v40, a[28:31], s[34:35] offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v40, a[16:19], s[34:35] offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v40, a[20:23], s[34:35] offset:80 +; GFX942DAG-NEXT: global_store_dwordx4 v40, a[8:11], s[34:35] offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v40, a[12:15], s[34:35] offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v40, a[0:3], s[34:35] +; GFX942DAG-NEXT: global_store_dwordx4 v40, a[4:7], s[34:35] offset:16 +; GFX942DAG-NEXT: s_endpgm +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_call: +; GFX942GSEL: ; %bb.0: ; %bb +; GFX942GSEL-NEXT: s_mov_b32 s12, s8 +; GFX942GSEL-NEXT: s_add_u32 s8, s4, 44 +; GFX942GSEL-NEXT: s_mov_b32 s13, s9 +; GFX942GSEL-NEXT: s_addc_u32 s9, s5, 0 +; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942GSEL-NEXT: s_getpc_b64 s[4:5] +; GFX942GSEL-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX942GSEL-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX942GSEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX942GSEL-NEXT: 
s_mov_b32 s14, s10 +; GFX942GSEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942GSEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX942GSEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX942GSEL-NEXT: v_mov_b32_e32 v31, v0 +; GFX942GSEL-NEXT: s_mov_b32 s32, 0 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[34:35] +; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[34:35] offset:16 +; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[34:35] offset:32 +; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[34:35] offset:48 +; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[34:35] offset:64 +; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[34:35] offset:80 +; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[34:35] offset:96 +; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[34:35] offset:112 +; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) +; GFX942GSEL-NEXT: s_nop 0 +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: s_nop 1 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX942GSEL-NEXT: s_endpgm bb: call void @foo() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -831,59 +1830,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; 
GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a27 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a31 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a19 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a23 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a9 +; GFX908-NEXT: 
v_accvgpr_read_b32 v5, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a11 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16 ; GFX908-NEXT: s_cbranch_scc1 .LBB6_2 @@ -906,6 +1905,331 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: .LBB6_2: ; %bb3 ; GFX908-NEXT: s_endpgm +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: +; GFX90ADAG: ; %bb.0: ; %bb1 +; GFX90ADAG-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX90ADAG-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX90ADAG-NEXT: s_mov_b32 s54, -1 +; GFX90ADAG-NEXT: s_mov_b32 s55, 0xe00000 +; GFX90ADAG-NEXT: s_add_u32 s52, s52, s11 +; GFX90ADAG-NEXT: s_mov_b32 s14, s10 +; GFX90ADAG-NEXT: 
s_mov_b32 s12, s8 +; GFX90ADAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX90ADAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX90ADAG-NEXT: s_load_dword s8, s[4:5], 0x2c +; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX90ADAG-NEXT: s_addc_u32 s53, s53, 0 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 +; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40 +; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX90ADAG-NEXT: s_bitcmp0_b32 s8, 0 +; GFX90ADAG-NEXT: s_mov_b32 s32, 0 +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a0, s36 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a1, s37 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a2, s38 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a3, s39 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a4, s40 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a5, s41 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a6, s42 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a7, s43 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a8, s44 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a9, s45 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a10, s46 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a11, s47 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a12, s48 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a13, s49 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a14, s50 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a15, s51 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a16, s16 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a17, s17 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a18, s18 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a19, s19 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a20, s20 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a21, s21 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a22, s22 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a23, s23 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a24, s24 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a25, s25 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a26, s26 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a27, s27 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a28, s28 +; 
GFX90ADAG-NEXT: v_accvgpr_write_b32 a29, s29 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a30, s30 +; GFX90ADAG-NEXT: v_accvgpr_write_b32 a31, s31 +; GFX90ADAG-NEXT: s_nop 1 +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16 +; GFX90ADAG-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX90ADAG-NEXT: ; %bb.1: ; %bb2 +; GFX90ADAG-NEXT: s_add_u32 s8, s4, 48 +; GFX90ADAG-NEXT: s_mov_b32 s13, s9 +; GFX90ADAG-NEXT: s_addc_u32 s9, s5, 0 +; GFX90ADAG-NEXT: s_getpc_b64 s[4:5] +; GFX90ADAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX90ADAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX90ADAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX90ADAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX90ADAG-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX90ADAG-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX90ADAG-NEXT: v_mov_b32_e32 v31, v0 +; GFX90ADAG-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX90ADAG-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX90ADAG-NEXT: .LBB6_2: ; %bb3 +; GFX90ADAG-NEXT: s_endpgm +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: +; GFX90AGSEL: ; %bb.0: ; %bb1 +; GFX90AGSEL-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX90AGSEL-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX90AGSEL-NEXT: s_mov_b32 s70, -1 +; GFX90AGSEL-NEXT: s_mov_b32 s71, 0xe00000 +; GFX90AGSEL-NEXT: s_add_u32 s68, s68, s11 +; GFX90AGSEL-NEXT: 
s_mov_b32 s14, s10 +; GFX90AGSEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX90AGSEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX90AGSEL-NEXT: s_mov_b64 s[16:17], s[0:1] +; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90AGSEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; GFX90AGSEL-NEXT: s_load_dwordx16 s[52:67], s[0:1], 0x40 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90AGSEL-NEXT: s_addc_u32 s69, s69, 0 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a0, s36 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a16, s52 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a1, s37 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a2, s38 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a3, s39 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a4, s40 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a5, s41 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a6, s42 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a7, s43 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a8, s44 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a9, s45 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a10, s46 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a11, s47 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a12, s48 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a13, s49 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a14, s50 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a17, s53 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a18, s54 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a19, s55 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a20, s56 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a21, s57 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a22, s58 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a23, s59 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a24, s60 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a25, s61 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a26, s62 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a27, s63 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a28, 
s64 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a29, s65 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a30, s66 +; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a31, s67 +; GFX90AGSEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX90AGSEL-NEXT: s_and_b32 s2, s2, 1 +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX90AGSEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX90AGSEL-NEXT: s_mov_b32 s32, 0 +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] +; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[4:7], s[0:1] offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[8:11], s[0:1] offset:32 +; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[12:15], s[0:1] offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[16:19], s[0:1] offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[20:23], s[0:1] offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[24:27], s[0:1] offset:96 +; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[28:31], s[0:1] offset:112 +; GFX90AGSEL-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX90AGSEL-NEXT: ; %bb.1: ; %bb2 +; GFX90AGSEL-NEXT: s_getpc_b64 s[0:1] +; GFX90AGSEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX90AGSEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX90AGSEL-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0 +; GFX90AGSEL-NEXT: s_mov_b32 s12, s8 +; GFX90AGSEL-NEXT: s_add_u32 s8, s4, 48 +; GFX90AGSEL-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX90AGSEL-NEXT: s_mov_b32 s13, s9 +; GFX90AGSEL-NEXT: s_addc_u32 s9, s5, 0 +; GFX90AGSEL-NEXT: s_mov_b64 s[2:3], s[70:71] +; GFX90AGSEL-NEXT: s_mov_b64 s[4:5], s[16:17] +; GFX90AGSEL-NEXT: v_mov_b32_e32 v31, v0 +; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90AGSEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX90AGSEL-NEXT: .LBB6_2: ; %bb3 +; GFX90AGSEL-NEXT: s_endpgm +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: +; GFX942DAG: ; %bb.0: ; %bb1 +; GFX942DAG-NEXT: s_mov_b32 s14, s10 +; GFX942DAG-NEXT: s_mov_b32 s12, s8 +; 
GFX942DAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942DAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942DAG-NEXT: s_load_dword s8, s[4:5], 0x2c +; GFX942DAG-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 +; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40 +; GFX942DAG-NEXT: s_bitcmp0_b32 s8, 0 +; GFX942DAG-NEXT: s_mov_b32 s32, 0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: v_accvgpr_write_b32 a0, s36 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a1, s37 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a2, s38 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a3, s39 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a4, s40 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a5, s41 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a6, s42 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a7, s43 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a8, s44 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a9, s45 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a10, s46 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a11, s47 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a12, s48 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a13, s49 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a14, s50 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a15, s51 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a16, s16 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a17, s17 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a18, s18 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a19, s19 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a20, s20 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a21, s21 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a22, s22 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a23, s23 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a24, s24 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a25, s25 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a26, s26 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a27, s27 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a28, s28 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a29, s29 +; GFX942DAG-NEXT: 
v_accvgpr_write_b32 a30, s30 +; GFX942DAG-NEXT: v_accvgpr_write_b32 a31, s31 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96 +; GFX942DAG-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80 +; GFX942DAG-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942DAG-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16 +; GFX942DAG-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX942DAG-NEXT: ; %bb.1: ; %bb2 +; GFX942DAG-NEXT: s_add_u32 s8, s4, 48 +; GFX942DAG-NEXT: s_mov_b32 s13, s9 +; GFX942DAG-NEXT: s_addc_u32 s9, s5, 0 +; GFX942DAG-NEXT: s_getpc_b64 s[4:5] +; GFX942DAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX942DAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX942DAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX942DAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX942DAG-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX942DAG-NEXT: v_mov_b32_e32 v31, v0 +; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942DAG-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX942DAG-NEXT: .LBB6_2: ; %bb3 +; GFX942DAG-NEXT: s_endpgm +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: +; GFX942GSEL: ; %bb.0: ; %bb1 +; GFX942GSEL-NEXT: s_mov_b32 s14, s10 +; GFX942GSEL-NEXT: s_mov_b32 s12, s8 +; GFX942GSEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942GSEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942GSEL-NEXT: s_load_dword s8, s[4:5], 0x2c +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x0 +; GFX942GSEL-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x40 +; GFX942GSEL-NEXT: 
v_mov_b32_e32 v1, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942GSEL-NEXT: s_xor_b32 s8, s8, 1 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a16, s36 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a17, s37 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a18, s38 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a19, s39 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a20, s40 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a21, s41 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a22, s42 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a23, s43 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a24, s44 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a25, s45 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a26, s46 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a27, s47 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a28, s48 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a29, s49 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a30, s50 +; GFX942GSEL-NEXT: v_accvgpr_write_b32 a31, s51 +; GFX942GSEL-NEXT: s_and_b32 s8, s8, 1 +; GFX942GSEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX942GSEL-NEXT: s_mov_b32 s32, 0 +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: 
global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112 +; GFX942GSEL-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX942GSEL-NEXT: ; %bb.1: ; %bb2 +; GFX942GSEL-NEXT: s_getpc_b64 s[6:7] +; GFX942GSEL-NEXT: s_add_u32 s6, s6, foo@gotpcrel32@lo+4 +; GFX942GSEL-NEXT: s_addc_u32 s7, s7, foo@gotpcrel32@hi+12 +; GFX942GSEL-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX942GSEL-NEXT: s_add_u32 s8, s4, 48 +; GFX942GSEL-NEXT: s_mov_b32 s13, s9 +; GFX942GSEL-NEXT: s_addc_u32 s9, s5, 0 +; GFX942GSEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX942GSEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX942GSEL-NEXT: v_mov_b32_e32 v31, v0 +; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942GSEL-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX942GSEL-NEXT: .LBB6_2: ; %bb3 +; GFX942GSEL-NEXT: s_endpgm bb1: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) @@ -972,40 +2296,40 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 +; GFX908-NEXT: 
v_accvgpr_read_b32 v4, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 ; GFX908-NEXT: 
v_accvgpr_read_b32 v2, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 @@ -1015,6 +2339,122 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: +; GFX90ADAG: ; %bb.0: ; %bb +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90ADAG-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 +; GFX90ADAG-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 +; GFX90ADAG-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 +; GFX90ADAG-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 +; GFX90ADAG-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 +; GFX90ADAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 +; GFX90ADAG-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX90ADAG-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v35, 2.0 +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) +; GFX90ADAG-NEXT: s_nop 0 +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33] +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off 
offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) +; GFX90ADAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: +; GFX90AGSEL: ; %bb.0: ; %bb +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90AGSEL-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX90AGSEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 +; GFX90AGSEL-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v34, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v35, 2.0 +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) +; GFX90AGSEL-NEXT: s_nop 0 +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33] +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: s_nop 2 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112 +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) +; GFX90AGSEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: +; GFX942DAG: ; %bb.0: ; %bb +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942DAG-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 +; GFX942DAG-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 +; GFX942DAG-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 +; GFX942DAG-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 +; GFX942DAG-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 +; GFX942DAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 +; GFX942DAG-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX942DAG-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX942DAG-NEXT: v_mov_b32_e32 v34, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v35, 2.0 +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) +; GFX942DAG-NEXT: s_nop 0 +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v34, v35, v[2:33] +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) +; GFX942DAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: +; GFX942GSEL: ; %bb.0: ; %bb +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942GSEL-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX942GSEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX942GSEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 +; GFX942GSEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 +; GFX942GSEL-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 +; 
GFX942GSEL-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 +; GFX942GSEL-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 +; GFX942GSEL-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 +; GFX942GSEL-NEXT: v_mov_b32_e32 v34, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v35, 2.0 +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) +; GFX942GSEL-NEXT: s_nop 0 +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v34, v35, v[2:33] +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: s_nop 1 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112 +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) +; GFX942GSEL-NEXT: s_setpc_b64 s[30:31] bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -1073,40 +2513,40 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 ; GFX908-NEXT: 
v_accvgpr_read_b32 v6, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX908-NEXT: 
v_accvgpr_read_b32 v4, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 @@ -1116,6 +2556,122 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: +; GFX90ADAG: ; %bb.0: ; %bb +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112 +; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96 +; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80 +; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64 +; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48 +; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32 +; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16 +; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v[0:1], off +; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX90ADAG-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) +; GFX90ADAG-NEXT: s_nop 0 +; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GFX90ADAG-NEXT: s_nop 15 +; GFX90ADAG-NEXT: s_nop 2 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48 +; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[0:3], off +; GFX90ADAG-NEXT: 
global_store_dwordx4 v[0:1], a[4:7], off offset:16 +; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) +; GFX90ADAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: +; GFX90AGSEL: ; %bb.0: ; %bb +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v[0:1], off +; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96 +; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX90AGSEL-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) +; GFX90AGSEL-NEXT: s_nop 0 +; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GFX90AGSEL-NEXT: s_nop 15 +; GFX90AGSEL-NEXT: s_nop 2 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96 +; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112 +; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) +; GFX90AGSEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: +; GFX942DAG: ; %bb.0: ; %bb +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v[0:1], off 
offset:112 +; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96 +; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80 +; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64 +; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48 +; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32 +; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16 +; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v[0:1], off +; GFX942DAG-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX942DAG-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) +; GFX942DAG-NEXT: s_nop 0 +; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31] +; GFX942DAG-NEXT: s_nop 15 +; GFX942DAG-NEXT: s_nop 1 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48 +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[0:3], off +; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16 +; GFX942DAG-NEXT: s_waitcnt vmcnt(0) +; GFX942DAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: +; GFX942GSEL: ; %bb.0: ; %bb +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v[0:1], off +; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16 +; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32 +; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48 +; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64 +; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80 +; 
GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96 +; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112 +; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX942GSEL-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) +; GFX942GSEL-NEXT: s_nop 0 +; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31] +; GFX942GSEL-NEXT: s_nop 15 +; GFX942GSEL-NEXT: s_nop 1 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96 +; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112 +; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) +; GFX942GSEL-NEXT: s_setpc_b64 s[30:31] bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -1130,5 +2686,5 @@ attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2 attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" } attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GCN: {{.*}} ; GFX90A: {{.*}} +; GFX942: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 0af655dfbbee9..d444db8cd1bdf 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -54,49 +54,64 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; 
GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; 
GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: @@ -285,49 +300,64 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; 
GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 
v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat: @@ -512,53 +542,69 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; 
GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 
0 @@ -592,7 +638,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -618,6 +663,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 @@ -651,7 +697,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -774,49 +819,64 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 
offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, 
a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: @@ -1019,133 +1079,179 @@ exit: define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: test_mfma_loop_vgpr_init: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX908-NEXT: 
v_accvgpr_write_b32 a27, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; 
GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX908-NEXT: .LBB4_1: 
; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX908-NEXT: s_add_i32 s0, s0, -1 ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: 
v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: 
global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_vgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1170,42 +1276,42 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_vgpr_init: ; GFX942: ; %bb.0: ; %entry 
-; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX942-NEXT: 
v_accvgpr_write_b32 a9, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1329,153 +1435,166 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], 
s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: 
v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_sgpr_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] -; GFX90A-NEXT: s_add_i32 s0, s0, -1 -; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 -; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] 
offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX90A-NEXT: s_endpgm -; -; GFX942-LABEL: test_mfma_loop_sgpr_init: -; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 12 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90A-NEXT: 
global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_loop_sgpr_init: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s1 +; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 
a31, a0 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1596,60 +1715,72 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; 
GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] 
offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1679,9 +1810,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1707,12 +1840,9 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 
v0 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1742,9 +1872,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1835,49 +1967,64 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, 
a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: 
v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_mfma_forward_init: @@ -2040,49 +2187,64 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 13 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 12 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 
v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: 
v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_agpr_init: @@ -2319,49 +2481,64 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr 
addrspace(1) %arg) ; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.4: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 10 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_nop 9 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: 
v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] 
offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_nested_loop_zeroinit: @@ -2822,8 +2999,8 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX908-NEXT: s_mov_b32 s4, 16 ; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX908-NEXT: s_mov_b32 s4, 16 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2871,6 +3048,7 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 @@ -2904,7 +3082,6 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2952,6 +3129,7 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 
0 @@ -2985,7 +3163,6 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 51cd564bdece3..800eb9efa571e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -95,123 +95,123 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 -; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 -; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33 -; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59 -; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58 -; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a32 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a33 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34 -; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57 -; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35 -; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55 -; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54 -; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36 -; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53 -; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52 -; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1 +; 
GREEDY908-NEXT: v_accvgpr_read_b32 v2, a35 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a36 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37 -; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51 -; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49 -; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48 -; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39 -; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46 -; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a38 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a39 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40 -; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41 -; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17 -; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16 -; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15 -; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a41 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a42 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43 -; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44 -; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11 
-; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10 -; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a44 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a45 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a46 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a47 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a48 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a49 +; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a50 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a51 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a52 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a53 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a54 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a55 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a56 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a57 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a58 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a59 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a61 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, 
v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: 
v_accvgpr_read_b32 v3, a11 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GREEDY908-NEXT: s_endpgm @@ -499,105 +499,73 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) -; FAST90A-NEXT: 
v_accvgpr_write_b32 a32, s36 -; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37 -; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38 -; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39 -; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40 -; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41 -; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42 -; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43 -; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44 -; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45 -; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46 -; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47 -; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48 -; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49 -; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50 -; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51 -; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4 -; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5 -; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6 -; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7 -; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8 -; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9 -; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10 -; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11 -; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12 -; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13 -; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14 -; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15 -; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16 -; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17 -; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18 -; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19 +; FAST90A-NEXT: v_accvgpr_write_b32 a0, s36 +; FAST90A-NEXT: v_accvgpr_write_b32 a1, s37 +; FAST90A-NEXT: v_accvgpr_write_b32 a2, s38 +; FAST90A-NEXT: v_accvgpr_write_b32 a3, s39 +; FAST90A-NEXT: v_accvgpr_write_b32 a4, s40 +; FAST90A-NEXT: v_accvgpr_write_b32 a5, s41 +; FAST90A-NEXT: v_accvgpr_write_b32 a6, s42 +; FAST90A-NEXT: v_accvgpr_write_b32 a7, s43 +; FAST90A-NEXT: v_accvgpr_write_b32 a8, s44 +; FAST90A-NEXT: v_accvgpr_write_b32 a9, s45 +; FAST90A-NEXT: v_accvgpr_write_b32 a10, s46 +; FAST90A-NEXT: v_accvgpr_write_b32 a11, s47 +; 
FAST90A-NEXT: v_accvgpr_write_b32 a12, s48 +; FAST90A-NEXT: v_accvgpr_write_b32 a13, s49 +; FAST90A-NEXT: v_accvgpr_write_b32 a14, s50 +; FAST90A-NEXT: v_accvgpr_write_b32 a15, s51 +; FAST90A-NEXT: v_accvgpr_write_b32 a16, s4 +; FAST90A-NEXT: v_accvgpr_write_b32 a17, s5 +; FAST90A-NEXT: v_accvgpr_write_b32 a18, s6 +; FAST90A-NEXT: v_accvgpr_write_b32 a19, s7 +; FAST90A-NEXT: v_accvgpr_write_b32 a20, s8 +; FAST90A-NEXT: v_accvgpr_write_b32 a21, s9 +; FAST90A-NEXT: v_accvgpr_write_b32 a22, s10 +; FAST90A-NEXT: v_accvgpr_write_b32 a23, s11 +; FAST90A-NEXT: v_accvgpr_write_b32 a24, s12 +; FAST90A-NEXT: v_accvgpr_write_b32 a25, s13 +; FAST90A-NEXT: v_accvgpr_write_b32 a26, s14 +; FAST90A-NEXT: v_accvgpr_write_b32 a27, s15 +; FAST90A-NEXT: v_accvgpr_write_b32 a28, s16 +; FAST90A-NEXT: v_accvgpr_write_b32 a29, s17 +; FAST90A-NEXT: v_accvgpr_write_b32 a30, s18 +; FAST90A-NEXT: v_accvgpr_write_b32 a31, s19 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[0:31] ; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 -; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 -; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 -; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27 -; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26 -; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25 -; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24 -; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23 -; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22 -; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21 -; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20 -; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19 -; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18 -; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17 -; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16 -; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15 -; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14 -; FAST90A-NEXT: v_accvgpr_read_b32 v19, 
a13 -; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12 -; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11 -; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10 -; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9 -; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8 -; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7 -; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6 -; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5 -; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4 -; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3 -; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2 -; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1 -; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0 -; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32 -; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33 -; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32 -; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31 -; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30 -; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29 -; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28 -; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27 -; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26 -; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25 -; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24 -; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23 -; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22 -; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21 -; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20 -; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19 -; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18 -; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17 -; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16 -; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15 -; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14 -; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13 -; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12 -; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11 -; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10 -; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9 -; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8 -; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7 -; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6 -; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5 -; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4 -; 
FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 +; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a32 +; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a33 +; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a34 +; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a35 +; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a36 +; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a37 +; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a38 +; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a39 +; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a40 +; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a41 +; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a42 +; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a43 +; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a44 +; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a45 +; FAST90A-NEXT: v_accvgpr_mov_b32 a16, a46 +; FAST90A-NEXT: v_accvgpr_mov_b32 a17, a47 +; FAST90A-NEXT: v_accvgpr_mov_b32 a18, a48 +; FAST90A-NEXT: v_accvgpr_mov_b32 a19, a49 +; FAST90A-NEXT: v_accvgpr_mov_b32 a20, a50 +; FAST90A-NEXT: v_accvgpr_mov_b32 a21, a51 +; FAST90A-NEXT: v_accvgpr_mov_b32 a22, a52 +; FAST90A-NEXT: v_accvgpr_mov_b32 a23, a53 +; FAST90A-NEXT: v_accvgpr_mov_b32 a24, a54 +; FAST90A-NEXT: v_accvgpr_mov_b32 a25, a55 +; FAST90A-NEXT: v_accvgpr_mov_b32 a26, a56 +; FAST90A-NEXT: v_accvgpr_mov_b32 a27, a57 +; FAST90A-NEXT: v_accvgpr_mov_b32 a28, a58 +; FAST90A-NEXT: v_accvgpr_mov_b32 a29, a59 +; FAST90A-NEXT: v_accvgpr_mov_b32 a30, a60 +; FAST90A-NEXT: v_accvgpr_mov_b32 a31, a61 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; FAST90A-NEXT: s_nop 15 @@ -626,82 +594,98 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908: ; %bb.0: ; %bb ; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 +; GREEDY908-NEXT: v_mov_b32_e32 v16, 0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY908-NEXT: v_mov_b32_e32 v5, s15 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s14 -; 
GREEDY908-NEXT: v_mov_b32_e32 v1, s13 -; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 +; GREEDY908-NEXT: v_mov_b32_e32 v17, s0 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v17 +; GREEDY908-NEXT: v_mov_b32_e32 v17, s3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 +; GREEDY908-NEXT: v_mov_b32_e32 v17, s6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v17 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 +; GREEDY908-NEXT: v_mov_b32_e32 v17, 
s9 +; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v17 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 +; GREEDY908-NEXT: v_mov_b32_e32 v17, s12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v17 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s14 +; GREEDY908-NEXT: v_mov_b32_e32 v17, s15 +; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v17 ; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] -; GREEDY908-NEXT: s_nop 8 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] +; GREEDY908-NEXT: s_nop 9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a16 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a20 +; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a21 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a22 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a23 +; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v3 +; GREEDY908-NEXT: 
v_accvgpr_write_b32 a9, v17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a24 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a25 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a26 +; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a27 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a28 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a29 +; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v17 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY908-NEXT: s_nop 9 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 +; 
GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 +; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a3 +; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a8 +; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a10 +; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a11 +; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a4 +; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a5 +; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a7 +; GREEDY908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 +; GREEDY908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 +; GREEDY908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GREEDY908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GREEDY908-NEXT: s_endpgm ; ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: @@ -709,39 +693,51 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3 -; GREEDY90A-NEXT: 
v_accvgpr_write_b32 a20, s2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 -; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] -; GREEDY90A-NEXT: s_nop 9 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s4 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s5 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, s6 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s8 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, s9 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s10 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s11 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s12 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s13 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s14 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] ; GREEDY90A-NEXT: s_nop 10 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a16 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a17 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a18 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a19 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a20 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a21 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a22 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a23 +; GREEDY90A-NEXT: 
v_accvgpr_mov_b32 a10, a24 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a25 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a26 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a27 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a28 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a29 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY90A-NEXT: s_nop 9 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: @@ -749,39 +745,51 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15 -; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14 -; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13 -; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12 -; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11 -; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10 -; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9 -; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8 -; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7 -; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6 -; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5 -; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4 -; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3 -; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 -; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: 
v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] -; GREEDY942-NEXT: s_nop 8 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY942-NEXT: v_accvgpr_write_b32 a4, s4 +; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s5 +; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s6 +; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s7 +; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s8 +; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s9 +; GREEDY942-NEXT: v_accvgpr_write_b32 a10, s10 +; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s11 +; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s12 +; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s13 +; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s14 +; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s15 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[16:31], v0, v1, a[0:15] ; GREEDY942-NEXT: s_nop 9 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a16 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a17 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a18 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a19 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a20 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a21 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a22 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a23 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a24 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a25 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a26 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, 
a27 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a28 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a29 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY942-NEXT: s_nop 8 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32: @@ -839,9 +847,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-LABEL: test_mfma_f32_16x16x1f32: ; FAST90A: ; %bb.0: ; %bb ; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 -; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 -; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0 +; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) @@ -862,8 +869,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18 ; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] ; FAST90A-NEXT: s_nop 10 ; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 ; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 @@ -880,8 +887,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28 ; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, 
v2, a[0:15] -; FAST90A-NEXT: s_nop 10 +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: s_nop 9 ; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index cf244f0b1f884..c77042d0c96c3 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,10 +6,10 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: s_mov_b32 s6, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -18,34 +18,33 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX942-NEXT: s_or_b32 s4, s3, 1 -; GFX942-NEXT: s_ashr_i32 s5, s3, 31 ; GFX942-NEXT: s_mov_b32 s3, s2 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: s_and_b32 s3, s5, s4 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] +; GFX942-NEXT: s_or_b32 s4, s6, 1 +; GFX942-NEXT: s_ashr_i32 s3, s6, 31 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3] +; GFX942-NEXT: s_and_b32 s6, s3, s4 +; GFX942-NEXT: s_nop 5 +; 
GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr3 -; GFX942-NEXT: ; implicit-def: $agpr2 +; GFX942-NEXT: ; implicit-def: $sgpr6 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX908-NEXT: s_mov_b32 s2, 0 -; GFX908-NEXT: s_mov_b32 s3, 0 +; GFX908-NEXT: s_mov_b32 s6, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -54,28 +53,28 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX908-NEXT: s_or_b32 s4, s3, 1 -; GFX908-NEXT: s_ashr_i32 s5, s3, 31 ; GFX908-NEXT: s_mov_b32 s3, s2 -; GFX908-NEXT: v_mov_b32_e32 v1, s2 -; GFX908-NEXT: s_nop 2 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: v_mov_b32_e32 v0, s2 +; GFX908-NEXT: v_mov_b32_e32 v1, s3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a1 +; GFX908-NEXT: s_or_b32 s4, s6, 1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: s_and_b32 s3, s5, s4 -; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3] +; GFX908-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX908-NEXT: s_ashr_i32 s3, s6, 31 +; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[0:1], v[0:1], a[0:3] +; GFX908-NEXT: s_and_b32 s6, s3, s4 +; GFX908-NEXT: 
s_nop 8 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: ; implicit-def: $sgpr3 -; GFX908-NEXT: ; implicit-def: $agpr2 +; GFX908-NEXT: ; implicit-def: $sgpr6 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir index 01506d0af1913..29f44282f06fc 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir @@ -83,13 +83,12 @@ body: | ; COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec - ; COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.1: ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; COALESCE-NEXT: {{ $}} - ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -102,12 +101,12 @@ body: | ; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead 
$scc ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 - ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] - ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1 - ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1 - ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]] + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_]].sub1 + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_]].sub1 + ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_MFMA_F32_16X16X16F16_e64_]].sub0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.3: ; COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) @@ -137,13 +136,12 @@ body: | ; GFX908-COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec - ; GFX908-COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 
0, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.1: ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -156,12 +154,12 @@ body: | ; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 - ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] - ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1 - ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1 - ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]] + ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_]].sub1 + ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_]].sub1 + ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; GFX908-COALESCE-NEXT: 
[[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_MFMA_F32_16X16X16F16_e64_]].sub0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.3: ; GFX908-COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir index a9207de317ea1..17458fa8b08a7 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir @@ -74,7 +74,7 @@ body: | ; COALESCE-NEXT: successors: %bb.3(0x80000000) ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 - ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec @@ -116,7 +116,7 @@ body: | ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub0 ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 - ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], 
[[COPY1]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index f4a9e7e8f2759..110604a7cd88e 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -521,8 +521,8 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 -; GFX908-NEXT: s_mov_b32 s14, s22 ; GFX908-NEXT: v_mov_b32_e32 v31, v32 +; GFX908-NEXT: s_mov_b32 s14, s22 ; GFX908-NEXT: s_mov_b32 s15, s21 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll index fc154604b8700..4e6b9166b3ed0 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll @@ -27,7 +27,6 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) % ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31] -; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: s_branch .LBB0_4 ; CHECK-NEXT: .LBB0_2: @@ -47,7 +46,6 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) % ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31] -; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; CHECK-NEXT: .LBB0_4: ; %endif ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:31] diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index b9e9893ede4e2..ecada6b300aa1 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -369,7 +369,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND @@ -378,73 +378,66 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[4:7] -; 
CHECK-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[10:11], s[0:1] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7] -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7] -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] +; CHECK-NEXT: s_nop 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; CHECK-NEXT: v_mov_b32_e32 v5, v4 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 ; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] ; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: 
v_cvt_f16_f32_e32 v23, v14 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3] -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a2 -; CHECK-NEXT: v_mov_b64_e32 v[20:21], 0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a0 -; CHECK-NEXT: v_cvt_f16_f32_e32 v15, v22 -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19] -; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v0 -; CHECK-NEXT: global_store_short v[20:21], v23, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] +; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] +; CHECK-NEXT: global_store_short v[12:13], v17, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7] -; CHECK-NEXT: global_store_short v[20:21], v15, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] +; CHECK-NEXT: global_store_short v[12:13], v9, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[20:21], v14, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16 +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 +; CHECK-NEXT: global_store_short v[12:13], v1, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short 
v[20:21], v14, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: global_store_short v[12:13], v14, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[20:21], v12, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] +; CHECK-NEXT: s_nop 6 +; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] +; CHECK-NEXT: global_store_short v[12:13], v8, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[20:21], v0, off +; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: global_store_short v[12:13], v0, off ; CHECK-NEXT: s_endpgm entry: %k0 = call <4 x float> asm sideeffect "; def $0", "=s"() @@ -819,32 +812,32 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg_ ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v18, 4.0 -; CHECK-NEXT: v_accvgpr_mov_b32 a17, a16 -; CHECK-NEXT: v_accvgpr_mov_b32 a16, a15 -; CHECK-NEXT: v_accvgpr_mov_b32 a15, a14 -; CHECK-NEXT: v_accvgpr_mov_b32 a14, a13 -; CHECK-NEXT: v_accvgpr_mov_b32 a13, a12 -; CHECK-NEXT: v_accvgpr_mov_b32 a12, a11 -; CHECK-NEXT: v_accvgpr_mov_b32 a11, a10 -; CHECK-NEXT: v_accvgpr_mov_b32 a10, a9 -; CHECK-NEXT: v_accvgpr_mov_b32 a9, a8 -; CHECK-NEXT: v_accvgpr_mov_b32 a8, a7 -; CHECK-NEXT: v_accvgpr_mov_b32 a7, a6 -; CHECK-NEXT: v_accvgpr_mov_b32 a6, a5 -; CHECK-NEXT: v_accvgpr_mov_b32 a5, a4 -; CHECK-NEXT: v_accvgpr_mov_b32 a4, a3 -; CHECK-NEXT: v_accvgpr_mov_b32 a3, a2 -; CHECK-NEXT: v_accvgpr_mov_b32 a2, a1 +; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1 +; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2 +; CHECK-NEXT: v_accvgpr_mov_b32 a2, a3 +; CHECK-NEXT: v_accvgpr_mov_b32 a3, a4 +; CHECK-NEXT: v_accvgpr_mov_b32 a4, a5 +; CHECK-NEXT: v_accvgpr_mov_b32 a5, a6 +; CHECK-NEXT: 
v_accvgpr_mov_b32 a6, a7 +; CHECK-NEXT: v_accvgpr_mov_b32 a7, a8 +; CHECK-NEXT: v_accvgpr_mov_b32 a8, a9 +; CHECK-NEXT: v_accvgpr_mov_b32 a9, a10 +; CHECK-NEXT: v_accvgpr_mov_b32 a10, a11 +; CHECK-NEXT: v_accvgpr_mov_b32 a11, a12 +; CHECK-NEXT: v_accvgpr_mov_b32 a12, a13 +; CHECK-NEXT: v_accvgpr_mov_b32 a13, a14 +; CHECK-NEXT: v_accvgpr_mov_b32 a14, a15 +; CHECK-NEXT: v_accvgpr_mov_b32 a15, a16 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v18, a[2:17] +; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v18, a[0:15] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; CHECK-NEXT: s_endpgm %def = call <32 x float> asm sideeffect "; def $0", "=a"() %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll index 4d864ad15b411..3ee558d6f8a9e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll @@ -576,9 +576,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a4, a5 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a4 -; GFX90A-NEXT: v_accvgpr_mov_b32 a0, a5 -; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[16:17] +; 
GFX90A-NEXT: v_accvgpr_read_b32 v3, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a5 +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -590,9 +590,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1) ; GFX940-NEXT: ; def a4, a5 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_accvgpr_mov_b32 a1, a4 -; GFX940-NEXT: v_accvgpr_mov_b32 a0, a5 -; GFX940-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1] +; GFX940-NEXT: v_accvgpr_read_b32 v3, a4 +; GFX940-NEXT: v_accvgpr_read_b32 v2, a5 +; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %asm = call { i32, i32 } asm "; def $0, $1", "={a4},={a5}"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 34043cd067b25..50cdf11eea2f7 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -413,25 +413,27 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; 
GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -672,25 +674,27 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index f65340470feb1..a6a84c780cb32 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -413,25 +413,27 @@ define 
void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -672,25 +674,27 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll index 51dc9a51ec9d0..0b20caea9cd95 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; 
GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -474,27 +478,31 @@ define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll index 7f8f2dbbb09a1..2ecbf9622a259 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -474,27 +478,31 @@ define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 
0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index 13e3d94c35446..bacec04ab7600 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -413,25 +413,27 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 
v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -672,25 +674,27 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] ; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll index 430f64164d24f..fb71492fb867d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -170,15 +170,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +186,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -273,27 +273,27 @@ define void 
@v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -469,27 +469,29 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 
v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -561,26 +563,27 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 
v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -651,27 +654,29 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -696,26 +701,27 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -744,32 +750,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -799,30 +808,33 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; 
GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -852,32 +864,35 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -907,33 +922,35 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: 
v_shuffle_v3f32_v2f32__3_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1245,34 +1262,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def 
v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1302,34 +1320,35 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; 
GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1459,27 +1478,29 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v3f32_v2f32__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1506,25 +1527,27 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1553,34 +1576,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1856,26 +1880,27 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1905,34 +1930,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; 
GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2013,28 +2039,29 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll index ef670e963bdb6..1ab87d6f19ec4 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -416,25 +416,27 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 
v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -959,28 +961,29 @@ define void @v_shuffle_v3f32_v3f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1007,27 +1010,29 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1395,13 +1400,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1411,13 +1417,14 @@ define void 
@v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2008,13 +2015,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2024,13 +2032,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: 
global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2859,28 +2868,29 @@ define void @v_shuffle_v3f32_v3f32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2908,27 +2918,29 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3007,13 +3019,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,13 +3036,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3167,27 +3181,29 @@ define void @v_shuffle_v3f32_v3f32__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3610,13 +3626,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3626,13 +3643,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3773,27 +3791,29 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], 
s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll index 50c69de069986..c5a08f098b4c6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll @@ -965,26 +965,29 @@ define void @v_shuffle_v3f32_v4f32__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1011,26 +1014,29 @@ define void @v_shuffle_v3f32_v4f32__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; 
GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1241,28 +1247,29 @@ define void @v_shuffle_v3f32_v4f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1289,26 +1296,29 @@ define void @v_shuffle_v3f32_v4f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def 
v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1335,28 +1345,29 @@ define void @v_shuffle_v3f32_v4f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1838,14 +1849,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,14 +1866,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; 
GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2667,14 +2678,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2684,14 +2695,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4664,28 +4675,29 @@ define void @v_shuffle_v3f32_v4f32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4713,26 +4725,29 @@ define void @v_shuffle_v3f32_v4f32__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4760,28 +4775,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: 
v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4808,26 +4824,27 @@ define void @v_shuffle_v3f32_v4f32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4861,14 +4878,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: 
v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4878,15 +4895,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5081,28 +5097,29 @@ define void @v_shuffle_v3f32_v4f32__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v3f32_v4f32__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5129,29 +5146,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: 
v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5675,14 +5692,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5692,15 +5709,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5899,28 +5915,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5947,29 +5964,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6706,29 +6723,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; 
GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7484,29 +7501,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll index 
ea4fac3b1d2b1..91790ab5ff97f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -170,15 +170,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +186,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -273,27 +273,27 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 
v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -469,27 +469,29 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_3_2: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -561,26 +563,27 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] 
%vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -651,27 +654,29 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -696,26 +701,27 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; 
GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -744,32 +750,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; 
GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -799,30 +808,33 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def 
$0", "=v"() @@ -852,32 +864,35 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -907,33 +922,35 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1245,34 +1262,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: 
v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1302,34 +1320,35 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1459,27 +1478,29 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; 
GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1506,25 +1527,27 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1553,34 +1576,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1856,26 +1880,27 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1905,34 +1930,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2013,28 +2039,29 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; 
GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index 7061c13b28d03..db780ced25148 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -416,25 +416,27 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -959,28 
+961,29 @@ define void @v_shuffle_v3i32_v3i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1007,27 +1010,29 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, 
v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1395,13 +1400,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1411,13 +1417,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; 
GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2008,13 +2015,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2024,13 +2032,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2859,28 +2868,29 @@ define void @v_shuffle_v3i32_v3i32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_3_3: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2908,27 +2918,29 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; 
GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3007,13 +3019,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,13 +3036,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; 
GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3167,27 +3181,29 @@ define void @v_shuffle_v3i32_v3i32__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3610,13 +3626,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3626,13 +3643,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3773,27 +3791,29 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll index 11d1897d0449f..92d6c95c26599 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll @@ -965,26 +965,29 @@ define void @v_shuffle_v3i32_v4i32__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: 
v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1011,26 +1014,29 @@ define void @v_shuffle_v3i32_v4i32__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1241,28 +1247,29 @@ define void @v_shuffle_v3i32_v4i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_0_0: ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1289,26 +1296,29 @@ define void @v_shuffle_v3i32_v4i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: 
global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1335,28 +1345,29 @@ define void @v_shuffle_v3i32_v4i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; 
GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1838,14 +1849,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,14 +1866,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2667,14 +2678,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2684,14 +2695,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4664,28 +4675,29 @@ define void @v_shuffle_v3i32_v4i32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; 
GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4713,26 +4725,29 @@ define void @v_shuffle_v3i32_v4i32__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 
-; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4760,28 +4775,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4808,26 +4824,27 @@ define void @v_shuffle_v3i32_v4i32__7_u_4(ptr 
addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4861,14 +4878,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4878,15 +4895,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5081,28 +5097,29 @@ define void @v_shuffle_v3i32_v4i32__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5129,29 +5146,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5675,14 +5692,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr 
addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5692,15 +5709,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5899,28 +5915,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, 
v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5947,29 +5964,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: 
;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6706,29 +6723,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7484,29 +7501,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll index a15fc3212f474..bbca5039bb02c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -695,28 +699,32 @@ define void @v_shuffle_v3i64_v2i64__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 
v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1555,28 +1563,32 @@ define void @v_shuffle_v3i64_v2i64__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2148,28 +2160,32 @@ define void @v_shuffle_v3i64_v2i64__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART 
; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll index fe132493ce536..8757639c501d2 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: 
v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -695,28 +699,32 @@ define void @v_shuffle_v3p0_v2p0__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; 
GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1555,28 +1563,32 @@ define void @v_shuffle_v3p0_v2p0__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2148,28 +2160,32 @@ define 
void @v_shuffle_v3p0_v2p0__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index bd0100a4ffdb5..6d294b58ceeec 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -170,15 +170,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +186,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -273,27 +273,27 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -469,27 +469,29 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -561,26 +563,27 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -651,27 +654,29 @@ 
define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -696,26 +701,27 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -744,32 +750,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; 
GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -799,30 +808,33 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -852,32 +864,35 @@ define 
void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -907,33 +922,35 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1245,34 +1262,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: 
v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1302,34 +1320,35 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; 
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1459,27 +1478,29 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; 
GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1506,25 +1527,27 @@ define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1553,34 +1576,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1856,26 +1880,27 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1905,34 +1930,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2013,28 +2039,29 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 
-; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll index cecd2a0e4b015..88d43df5938ee 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll @@ -416,25 +416,27 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ 
-959,28 +961,29 @@ define void @v_shuffle_v3p3_v3p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1007,27 +1010,29 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: 
global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1395,13 +1400,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1411,13 +1417,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2008,13 +2015,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2024,13 +2032,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2859,28 +2868,29 @@ define void @v_shuffle_v3p3_v3p3__4_3_3(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2908,27 +2918,29 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3007,13 +3019,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,13 +3036,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: 
v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3167,27 +3181,29 @@ define void @v_shuffle_v3p3_v3p3__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3610,13 +3626,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: 
;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3626,13 +3643,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3773,27 +3791,29 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll index 834f03f013ba1..c9f194d873e35 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll @@ -965,26 +965,29 @@ define void @v_shuffle_v3p3_v4p3__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1011,26 +1014,29 @@ define void @v_shuffle_v3p3_v4p3__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1241,28 +1247,29 @@ define void 
@v_shuffle_v3p3_v4p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1289,26 +1296,29 @@ define void @v_shuffle_v3p3_v4p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; 
GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1335,28 +1345,29 @@ define void @v_shuffle_v3p3_v4p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 
v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1838,14 +1849,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,14 +1866,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() @@ -2667,14 +2678,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2684,14 +2695,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4664,28 +4675,29 @@ define void @v_shuffle_v3p3_v4p3__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 
v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4713,26 +4725,29 @@ define void @v_shuffle_v3p3_v4p3__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4760,28 +4775,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4808,26 +4824,27 @@ define void @v_shuffle_v3p3_v4p3__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4861,14 +4878,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: 
v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4878,15 +4895,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5081,28 +5097,29 @@ define void @v_shuffle_v3p3_v4p3__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART 
; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5129,29 +5146,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5675,14 +5692,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5692,15 +5709,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5899,28 +5915,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND 
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5947,29 +5964,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v3p3_v4p3__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6706,29 +6723,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: 
v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7484,29 +7501,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index df148f299a165..c7092f04a23ed 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4f32_v2f32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2380,28 +2380,29 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: 
v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index d4ee6fa20cad8..1224ab2b381c9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +271,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -413,27 +414,27 @@ define void @v_shuffle_v4f32_v3f32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: 
;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -553,15 +554,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,16 +571,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -609,16 +612,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +629,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -719,27 
+722,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -766,28 +771,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: 
v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1031,31 +1037,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1083,28 +1089,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 
v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1180,28 +1189,29 @@ define void @v_shuffle_v4f32_v3f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1325,31 +1335,31 @@ define void @v_shuffle_v4f32_v3f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1375,28 +1385,29 @@ define void @v_shuffle_v4f32_v3f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1488,15 +1499,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1506,15 +1517,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1545,34 +1556,33 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1606,15 +1616,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1624,15 +1634,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; 
GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1664,17 +1674,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1682,17 +1692,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: 
v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1724,17 +1734,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1742,17 +1752,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 
-; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1843,15 +1853,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1861,15 +1871,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1901,16 +1911,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1928,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2018,17 +2028,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg 
%ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2036,17 +2046,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2078,17 +2088,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2096,17 +2106,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2603,16 +2613,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; 
GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2620,16 +2631,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2661,16 +2673,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,17 +2691,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2720,16 +2733,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2737,16 +2751,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2834,16 +2849,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2851,17 +2867,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2893,15 +2909,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2926,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2949,15 +2966,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2965,16 +2984,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; 
GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3006,16 +3026,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,17 +3044,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: 
;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3065,16 +3086,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3082,17 +3104,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: 
v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3124,16 +3146,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3141,17 +3164,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; 
GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3584,17 +3607,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3602,17 +3625,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; 
GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3871,16 +3894,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3888,16 +3911,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3929,16 +3952,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3946,16 +3970,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4045,36 
+4070,37 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4358,31 +4384,31 @@ define void 
@v_shuffle_v4f32_v3f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4410,29 +4436,27 @@ define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4463,17 +4487,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 
v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4481,17 +4505,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4580,36 +4604,35 @@ define void @v_shuffle_v4f32_v3f32__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4637,31 +4660,31 @@ define void @v_shuffle_v4f32_v3f32__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4689,31 +4712,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 
v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4741,28 +4764,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; 
def $0", "=v"() @@ -4793,16 +4817,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4810,17 +4835,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5376,28 +5401,27 @@ define void @v_shuffle_v4f32_v3f32__5_u_4_4(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5428,17 +5452,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: 
global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5446,17 +5470,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5543,36 +5567,35 @@ define void @v_shuffle_v4f32_v3f32__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 
v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5599,28 +5622,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5648,28 +5674,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: 
v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5697,27 +5726,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5749,16 +5780,17 @@ define void 
@v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5766,17 +5798,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5808,17 +5840,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5826,17 +5858,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5922,27 +5954,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6508,30 +6544,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], 
s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6604,28 +6641,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6657,16 +6695,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6674,17 +6713,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def 
v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6830,28 +6869,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll index edc540edb3ad1..d5bd41397c4f0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll @@ -963,26 +963,29 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1009,26 
+1012,29 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1440,31 +1446,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1734,28 +1740,31 @@ define void @v_shuffle_v4f32_v4f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2785,14 +2794,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2802,14 +2812,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; 
GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4138,14 +4149,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4155,14 +4167,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4196,14 +4209,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4213,14 +4227,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5448,34 +5463,37 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7172,28 +7190,31 @@ define void @v_shuffle_v4f32_v4f32__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7271,28 +7292,29 @@ define void @v_shuffle_v4f32_v4f32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; 
GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7559,30 +7581,31 @@ define void @v_shuffle_v4f32_v4f32__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; 
GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7660,28 +7683,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7763,14 +7789,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def 
v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7780,14 +7807,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8041,31 +8069,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8946,28 +8974,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9047,14 +9078,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9064,14 +9096,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; 
GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9105,14 +9138,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9122,14 +9156,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9277,28 
+9312,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9327,31 +9365,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 
v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10319,15 +10357,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: 
global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,15 +10375,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10553,31 +10591,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11777,31 +11815,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: 
global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11830,31 +11868,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll index 9d3affa6da266..03503c9dac197 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4i32_v2i32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2386,28 +2386,29 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] 
; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index 1a669adf2b635..0222f73fbd193 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +271,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -413,27 +414,27 @@ define void @v_shuffle_v4i32_v3i32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX942: ; 
%bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -553,15 +554,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,16 +571,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, 
v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -609,16 +612,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +629,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -719,27 +722,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -766,28 +771,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1031,31 +1037,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: 
global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1083,28 +1089,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART 
-; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1180,28 +1189,29 @@ define void @v_shuffle_v4i32_v3i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; 
GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1325,31 +1335,31 @@ define void @v_shuffle_v4i32_v3i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1375,28 +1385,29 @@ define void @v_shuffle_v4i32_v3i32__3_0_0_0(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1488,15 +1499,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], 
s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1506,15 +1517,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1545,34 +1556,33 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1606,15 +1616,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1624,15 +1634,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; 
def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1664,17 +1674,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1682,17 +1692,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1724,17 +1734,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1742,17 +1752,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; 
GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1843,15 +1853,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1861,15 +1871,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 
v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1901,16 +1911,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1928,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", 
"=v"() @@ -2018,17 +2028,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2036,17 +2046,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2078,17 +2088,17 @@ define void 
@v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2096,17 +2106,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2603,16 +2613,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) 
{ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2620,16 +2631,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2661,16 +2673,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; 
GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,17 +2691,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2720,16 +2733,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def 
v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2737,16 +2751,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2834,16 +2849,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2851,17 +2867,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2893,15 +2909,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; 
GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2926,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2949,15 +2966,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2965,16 +2984,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3006,16 +3026,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,17 +3044,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3065,16 +3086,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3082,17 +3104,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] 
; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3124,16 +3146,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3141,17 +3164,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: 
v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3584,17 +3607,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3602,17 +3625,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; 
GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3871,16 +3894,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3888,16 +3911,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; 
GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3929,16 +3952,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3946,16 +3970,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call 
<3 x i32> asm "; def $0", "=v"() @@ -4045,36 +4070,37 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() 
@@ -4358,31 +4384,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4410,29 +4436,27 @@ define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4463,17 +4487,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 
v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4481,17 +4505,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4580,36 +4604,35 @@ define void @v_shuffle_v4i32_v3i32__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: 
global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4637,31 +4660,31 @@ define void @v_shuffle_v4i32_v3i32__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4689,31 +4712,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: 
v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4741,28 +4764,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4793,16 +4817,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4810,17 +4835,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5376,28 +5401,27 @@ define void 
@v_shuffle_v4i32_v3i32__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5428,17 +5452,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; 
GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5446,17 +5470,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5543,36 +5567,35 @@ define void @v_shuffle_v4i32_v3i32__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 
v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5599,28 +5622,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5648,28 +5674,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; 
GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5697,27 +5726,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5749,16 
+5780,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5766,17 +5798,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5808,17 +5840,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5826,17 +5858,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5922,27 +5954,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6508,30 +6544,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 
v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6604,28 +6641,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX942: ; 
%bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6657,16 +6695,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6674,17 +6713,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; 
def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6830,28 +6869,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll index 983afa566e2c1..ee2f94b90ffa9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll @@ -963,26 +963,29 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1009,26 
+1012,29 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1440,31 +1446,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1734,28 +1740,31 @@ define void @v_shuffle_v4i32_v4i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2785,14 +2794,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2802,14 +2812,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; 
GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4138,14 +4149,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4155,14 +4167,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4196,14 +4209,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4213,14 +4227,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5448,34 +5463,37 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: 
;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7172,28 +7190,31 @@ define void @v_shuffle_v4i32_v4i32__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7271,28 +7292,29 @@ define void @v_shuffle_v4i32_v4i32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7559,30 +7581,31 @@ define void @v_shuffle_v4i32_v4i32__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: 
v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7660,28 +7683,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7763,14 +7789,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7780,14 +7807,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8041,31 +8069,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8946,28 +8974,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9047,14 +9078,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9064,14 +9096,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; 
GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9105,14 +9138,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9122,14 +9156,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9277,28 +9312,31 
@@ define void @v_shuffle_v4i32_v4i32__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9327,31 +9365,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; 
GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10319,15 +10357,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, 
v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,15 +10375,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10553,31 +10591,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 
v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11777,31 +11815,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; 
GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11830,31 +11868,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll index 
ac7d9557ce765..21ec9acf6317d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -624,15 +628,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: 
v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -642,18 +646,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -752,15 +756,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; 
GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -770,15 +774,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -953,33 +957,39 @@ define void @v_shuffle_v4i64_v2i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND 
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1206,18 +1216,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; 
GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1227,18 +1237,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1556,15 +1566,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,18 +1584,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1663,33 +1673,33 @@ define void @v_shuffle_v4i64_v2i64__0_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1961,17 +1971,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v2 ; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; 
GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1984,17 +1994,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v2 ; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2447,33 +2457,39 @@ define void @v_shuffle_v4i64_v2i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; 
GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2508,15 +2524,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, 
v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2526,15 +2542,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2577,17 +2593,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2597,21 +2613,21 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2649,18 +2665,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2670,19 +2686,19 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2717,15 +2733,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ 
-2735,15 +2751,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2776,13 +2792,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,13 +2808,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3134,33 +3150,33 @@ define void @v_shuffle_v4i64_v2i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, 
v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3374,39 +3390,39 @@ define void @v_shuffle_v4i64_v2i64__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll index 8dd4a40d00680..615b382aa355a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll @@ -1126,15 +1126,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1144,15 +1144,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; 
GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1388,13 +1388,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1404,13 +1406,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 
v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3637,33 +3641,33 @@ define void @v_shuffle_v4i64_v3i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, 
v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4787,13 +4791,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4803,13 +4809,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5169,15 +5177,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v0 ; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5187,15 +5195,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v0 ; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5532,15 +5540,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def 
v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5550,15 +5558,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6255,17 +6263,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 
v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6275,17 +6283,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6970,33 +6978,33 @@ define void @v_shuffle_v4i64_v3i64__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7344,15 +7352,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 
v11, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7362,15 +7370,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll index ea9ef2f1ac94a..32f6e00716e37 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -8328,15 +8328,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, 
v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8346,15 +8346,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11254,15 +11254,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], 
s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11272,15 +11272,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll index b30af835a7882..ee3b303f88471 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 
v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -624,15 +628,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -642,18 +646,18 @@ define void 
@v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -752,15 +756,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -770,15 +774,15 @@ define void 
@v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -953,33 +957,39 @@ define void @v_shuffle_v4p0_v2p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: ; 
GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1206,18 +1216,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1227,18 +1237,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: 
;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1556,15 +1566,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,18 +1584,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1663,33 +1673,33 @@ define void @v_shuffle_v4p0_v2p0__0_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 
0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1961,17 +1971,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v2 ; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1984,17 +1994,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v2 ; 
GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2447,33 +2457,39 @@ define void @v_shuffle_v4p0_v2p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 
; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2508,15 +2524,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2526,15 +2542,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: 
v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2577,17 +2593,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2597,21 +2613,21 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 
v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2649,18 +2665,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2670,19 +2686,19 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: 
v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2717,15 +2733,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2735,15 +2751,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2776,13 +2792,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,13 +2808,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3134,33 +3150,33 @@ define void @v_shuffle_v4p0_v2p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3374,39 +3390,39 @@ define void @v_shuffle_v4p0_v2p0__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: 
v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll index e6ac554735eee..09e497259766e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll @@ -1126,15 
+1126,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1144,15 +1144,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1388,13 +1388,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr 
addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1404,13 +1406,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3637,33 +3641,33 @@ define void @v_shuffle_v4p0_v3p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4787,13 +4791,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4803,13 +4809,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5169,15 +5177,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v0 ; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 
v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5187,15 +5195,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v0 ; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5532,15 +5540,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; @@ -5550,15 +5558,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6255,17 +6263,17 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6275,17 +6283,17 @@ define void 
@v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6970,33 +6978,33 @@ define void @v_shuffle_v4p0_v3p0__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: 
v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7344,15 +7352,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7362,15 +7370,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: 
v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll index ce1c54129f706..257af574366a6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -8328,15 +8328,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8346,15 +8346,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: 
v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11254,15 +11254,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11272,15 +11272,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; 
GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll index 3b5690562c38a..90a1b99dc7c14 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4p3_v2p3__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2386,28 +2386,29 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index 8039e126590b9..d13d26f638e0c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +271,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -413,27 +414,27 @@ define void @v_shuffle_v4p3_v3p3__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -553,15 +554,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; @@ -569,16 +571,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -609,16 +612,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +629,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; 
GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -719,27 +722,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -766,28 +771,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1031,31 +1037,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1083,28 +1089,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; 
GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1180,28 +1189,29 @@ define void @v_shuffle_v4p3_v3p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1325,31 +1335,31 @@ define void @v_shuffle_v4p3_v3p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; 
GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1375,28 +1385,29 @@ define void @v_shuffle_v4p3_v3p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1488,15 +1499,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1506,15 +1517,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1545,34 +1556,33 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1606,15 +1616,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: 
v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1624,15 +1634,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1664,17 +1674,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1682,17 +1692,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1724,17 +1734,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1742,17 +1752,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1843,15 +1853,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ 
-1861,15 +1871,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1901,16 +1911,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1928,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; 
GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2018,17 +2028,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2036,17 +2046,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2078,17 +2088,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2096,17 +2106,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def 
v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2603,16 +2613,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2620,16 +2631,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: 
v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2661,16 +2673,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,17 +2691,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; 
GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2720,16 +2733,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2737,16 +2751,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: 
v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2834,16 +2849,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2851,17 +2867,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] 
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2893,15 +2909,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2926,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2949,15 +2966,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; 
GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2965,16 +2984,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3006,16 +3026,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,17 +3044,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3065,16 +3086,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: 
v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3082,17 +3104,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3124,16 +3146,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: 
v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3141,17 +3164,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3584,17 +3607,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; 
GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3602,17 +3625,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3871,16 +3894,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], 
s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3888,16 +3911,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3929,16 +3952,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3946,16 +3970,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) 
inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4045,36 +4070,37 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4358,31 +4384,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: 
; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4410,29 +4436,27 @@ define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4463,17 +4487,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4481,17 +4505,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4580,36 +4604,35 @@ define void @v_shuffle_v4p3_v3p3__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4637,31 +4660,31 @@ define void @v_shuffle_v4p3_v3p3__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4689,31 +4712,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND 
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4741,28 +4764,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4793,16 +4817,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4810,17 +4835,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5376,28 +5401,27 @@ define void @v_shuffle_v4p3_v3p3__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5428,17 +5452,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5446,17 +5470,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: 
v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5543,36 +5567,35 @@ define void @v_shuffle_v4p3_v3p3__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 
= call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5599,28 +5622,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5648,28 +5674,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: 
;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5697,27 +5726,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5749,16 +5780,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5766,17 +5798,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5808,17 +5840,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5826,17 +5858,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; 
def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5922,27 +5954,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; 
GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6508,30 +6544,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], 
s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6604,28 +6641,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6657,16 +6695,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6674,17 +6713,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6830,28 +6869,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll index eeab42ae40d7f..1684b94cfd452 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll @@ -963,26 +963,29 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: 
global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1009,26 +1012,29 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: 
v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1440,31 +1446,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1734,28 +1740,31 @@ define void @v_shuffle_v4p3_v4p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2785,14 +2794,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2802,14 +2812,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4138,14 +4149,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 
v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4155,14 +4167,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4196,14 +4209,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4213,14 +4227,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; 
def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5448,34 +5463,37 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; 
GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7172,28 +7190,31 @@ define void @v_shuffle_v4p3_v4p3__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7271,28 +7292,29 @@ define void @v_shuffle_v4p3_v4p3__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7559,30 +7581,31 @@ define void @v_shuffle_v4p3_v4p3__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: 
v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7660,28 +7683,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; 
GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7763,14 +7789,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7780,14 +7807,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8041,31 +8069,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 
+; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8946,28 +8974,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9047,14 +9078,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9064,14 +9096,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9105,14 +9138,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; 
GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9122,14 +9156,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9277,28 +9312,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9327,31 +9365,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: 
global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10319,15 +10357,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,15 +10375,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10553,31 +10591,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11777,31 +11815,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11830,31 +11868,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; 
GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index d2008be4fd32a..96b18593ea655 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -16,19 +16,18 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base -; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s68, -1 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_cselect_b32 s5, s9, 0 -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_cselect_b32 s6, s68, 0 +; CHECK-NEXT: s_add_u32 s50, s34, 48 ; CHECK-NEXT: v_mov_b32_e32 v57, s5 ; CHECK-NEXT: s_mov_b32 s5, s4 -; CHECK-NEXT: s_add_u32 s50, s34, 48 -; 
CHECK-NEXT: v_accvgpr_write_b32 a33, s5 ; CHECK-NEXT: s_addc_u32 s51, s35, 0 -; CHECK-NEXT: v_accvgpr_write_b32 a32, s4 +; CHECK-NEXT: v_pk_mov_b32 v[62:63], s[4:5], s[4:5] op_sel:[0,1] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12 @@ -48,13 +47,13 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s52, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: v_mov_b32_e32 v62, s66 -; CHECK-NEXT: v_mov_b32_e32 v63, s67 -; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] +; CHECK-NEXT: v_mov_b32_e32 v60, s66 +; CHECK-NEXT: v_mov_b32_e32 v61, s67 +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63] ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] -; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[58:59] +; CHECK-NEXT: flat_load_dwordx2 a[32:33], v[58:59] ; CHECK-NEXT: v_mov_b32_e32 v44, 0 ; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -66,7 +65,7 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: flat_store_dwordx2 v[46:47], v[44:45] -; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] @@ -75,9 +74,9 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: v_mov_b32_e32 v1, s67 ; CHECK-NEXT: v_mov_b32_e32 v0, s68 ; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42 -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[58:59], 
v[62:63] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] ; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index b045c761436de..644705e173b52 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX942-LABEL: v8i8_multi_block: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3 +; GFX942-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[6:7], v2, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_4 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3 +; GFX942-NEXT: global_load_dwordx2 v[4:5], v2, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_3 ; GFX942-NEXT: ; %bb.2: ; %bb.2 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: global_store_dwordx2 v1, v[6:7], s[12:13] ; GFX942-NEXT: .LBB11_3: ; %Flow ; GFX942-NEXT: s_or_b64 
exec, exec, s[2:3] ; GFX942-NEXT: .LBB11_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15] +; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() From f797b9bb2b74c6f090067b4495cd10813709adbe Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 8 Oct 2025 15:36:16 -0700 Subject: [PATCH 3/5] Revert "Update lit tests" This reverts commit 6d5273761c2659ecaf8f453f8c9def032aed145e. --- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 144 +- llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll | 406 +- .../CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll | 81 +- .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 3680 +++--- .../AMDGPU/a-v-global-atomic-cmpxchg.ll | 28 +- .../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 1070 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 12 +- llvm/test/CodeGen/AMDGPU/agpr-csr.ll | 680 +- llvm/test/CodeGen/AMDGPU/agpr-remat.ll | 16 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 42 +- .../buffer-fat-pointer-atomicrmw-fmax.ll | 30 +- .../buffer-fat-pointer-atomicrmw-fmin.ll | 30 +- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 307 +- llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 422 +- .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 4 + .../CodeGen/AMDGPU/global-i16-load-store.ll | 12 +- .../AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 7 +- .../AMDGPU/lds-dma-workgroup-release.ll | 24 +- .../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll | 8 +- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 12 +- .../llvm.amdgcn.image.atomic.dim.gfx90a.ll | 34 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll | 96 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 80 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 476 +- .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 146 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 1236 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll | 41 +- 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 1420 +- ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 3668 ++---- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 10777 ++++++---------- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 2446 ++-- ...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 4 + ...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 16 +- ...uffer-fat-pointers-nontemporal-metadata.ll | 24 +- llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 2000 +-- llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 1321 +- .../AMDGPU/mfma-no-register-aliasing.ll | 604 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 51 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.mir | 26 +- .../CodeGen/AMDGPU/no-fold-accvgpr-read.mir | 4 +- .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 2 +- .../AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll | 2 + .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 119 +- .../AMDGPU/shufflevector-physreg-copy.ll | 12 +- .../AMDGPU/shufflevector.v2f32.v3f32.ll | 28 +- .../AMDGPU/shufflevector.v2i32.v3i32.ll | 28 +- .../AMDGPU/shufflevector.v2i64.v2i64.ll | 40 +- .../CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll | 40 +- .../CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll | 28 +- .../AMDGPU/shufflevector.v3f32.v2f32.ll | 391 +- .../AMDGPU/shufflevector.v3f32.v3f32.ll | 200 +- .../AMDGPU/shufflevector.v3f32.v4f32.ll | 377 +- .../AMDGPU/shufflevector.v3i32.v2i32.ll | 391 +- .../AMDGPU/shufflevector.v3i32.v3i32.ll | 200 +- .../AMDGPU/shufflevector.v3i32.v4i32.ll | 377 +- .../AMDGPU/shufflevector.v3i64.v2i64.ll | 92 +- .../CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll | 92 +- .../CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll | 391 +- .../CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll | 200 +- .../CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll | 377 +- .../AMDGPU/shufflevector.v4f32.v2f32.ll | 45 +- .../AMDGPU/shufflevector.v4f32.v3f32.ll | 1474 +-- .../AMDGPU/shufflevector.v4f32.v4f32.ll | 518 +- .../AMDGPU/shufflevector.v4i32.v2i32.ll | 45 +- .../AMDGPU/shufflevector.v4i32.v3i32.ll | 1474 +-- .../AMDGPU/shufflevector.v4i32.v4i32.ll | 
518 +- .../AMDGPU/shufflevector.v4i64.v2i64.ll | 364 +- .../AMDGPU/shufflevector.v4i64.v3i64.ll | 204 +- .../AMDGPU/shufflevector.v4i64.v4i64.ll | 40 +- .../CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll | 364 +- .../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 204 +- .../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 40 +- .../CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll | 45 +- .../CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll | 1474 +-- .../CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll | 518 +- .../AMDGPU/undef-handling-crash-in-ra.ll | 23 +- .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 22 +- 78 files changed, 18502 insertions(+), 23754 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 9f1955c78eb36..7e297f46a780e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -23,9 +23,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_add_noret_f64: @@ -34,9 +34,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 
v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64: @@ -142,9 +142,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64: @@ -153,9 +153,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64: @@ -261,9 +261,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], 
v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_add_noret_f64: @@ -272,9 +272,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64: @@ -379,9 +379,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64: @@ -390,9 +390,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64: @@ -497,9 +497,9 @@ define 
amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_min_noret_f64: @@ -508,9 +508,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64: @@ -616,9 +616,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64: @@ -627,9 +627,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX942-NEXT: s_load_dword 
s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64: @@ -735,9 +735,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_min_noret_f64: @@ -746,9 +746,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64: @@ -853,9 +853,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 
v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64: @@ -864,9 +864,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64: @@ -971,9 +971,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_max_noret_f64: @@ -982,9 +982,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_max_f64 
v[2:3], v0, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64: @@ -1090,9 +1090,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64: @@ -1101,9 +1101,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64: @@ -1209,9 +1209,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: 
v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_max_noret_f64: @@ -1220,9 +1220,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64: @@ -1327,9 +1327,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64: @@ -1338,9 +1338,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX942-NEXT: s_endpgm ; ; 
GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64: diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll index 2968e0441d349..4c62409a85c00 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll @@ -183,125 +183,122 @@ define void @ds_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 { ; CHECK-LABEL: ds_atomic_xchg_i32_ret_av_av_no_agprs: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 
offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a33, v31 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded 
Spill +; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a32, v30 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v29 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v28 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v27 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v26 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v25 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v24 -; CHECK-NEXT: v_accvgpr_write_b32 a25, v23 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v22 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v21 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v20 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v19 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v18 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v17 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v16 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v15 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v14 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v13 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v12 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v11 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v10 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v9 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v8 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v7 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v6 -; CHECK-NEXT: v_accvgpr_write_b32 
a7, v5 -; CHECK-NEXT: v_accvgpr_write_b32 a6, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v3 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a19 
-; CHECK-NEXT: v_accvgpr_read_b32 v18, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v20, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v21, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v22, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v23, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a31 -; CHECK-NEXT: v_accvgpr_read_b32 v30, a32 -; CHECK-NEXT: v_accvgpr_read_b32 v31, a33 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; 
CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse +; 
CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a0 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse ; CHECK-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10 %data = call i32 asm "; def $0", "=^VA"() @@ -747,125 +744,122 @@ define void @ds_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) 
#0 { ; CHECK-LABEL: ds_atomic_xor_i32_ret_av_av_no_agprs: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse +; 
CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a33, v31 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v13, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a32, v30 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v29 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v28 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v27 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v26 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v25 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v24 -; CHECK-NEXT: v_accvgpr_write_b32 a25, v23 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v22 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v21 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v20 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v19 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v18 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v17 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v16 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v15 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v14 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v13 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v12 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v11 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v10 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v9 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v8 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v7 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v6 -; CHECK-NEXT: v_accvgpr_write_b32 a7, v5 -; CHECK-NEXT: v_accvgpr_write_b32 a6, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v3 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1 ; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v20, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v21, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v22, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v23, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a28 -; CHECK-NEXT: 
v_accvgpr_read_b32 v27, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a31 -; CHECK-NEXT: v_accvgpr_read_b32 v30, a32 -; CHECK-NEXT: v_accvgpr_read_b32 v31, a33 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 
4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse 
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a0 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse ; CHECK-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10 %data = call i32 asm "; def $0", "=^VA"() diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll index e9192ca2d03ac..bc341f2baa804 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll @@ -472,46 +472,49 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_av__a: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB14_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: .LBB14_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB14_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: buffer_store_dword 
v0, v6, s[0:3], 0 offen +; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; CHECK-NEXT: .LBB14_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -530,50 +533,53 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_a__a: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB15_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: .LBB15_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB15_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; CHECK-NEXT: .LBB15_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -768,46 +774,49 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_v_v__a: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; 
CHECK-NEXT: ; def v[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB19_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: .LBB19_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB19_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; CHECK-NEXT: .LBB19_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: 
; use a[0:1] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 4a8225fcd6ad2..d053425afbb6d 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -338,264 +338,225 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 
4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31 -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a17, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX90A-NEXT: 
v_accvgpr_read_b32 v5, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:31] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], 
s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX90A-NEXT: 
v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v63, s32 
offset:12 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill +; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a34 +; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_accvgpr_write_b32 a33, v31 -; GFX950-NEXT: v_accvgpr_write_b32 a32, v30 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v29 -; GFX950-NEXT: 
v_accvgpr_write_b32 a30, v28 -; GFX950-NEXT: v_accvgpr_write_b32 a29, v27 -; GFX950-NEXT: v_accvgpr_write_b32 a28, v26 -; GFX950-NEXT: v_accvgpr_write_b32 a27, v25 -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 -; GFX950-NEXT: v_accvgpr_write_b32 a25, v23 -; GFX950-NEXT: v_accvgpr_write_b32 a24, v22 -; GFX950-NEXT: v_accvgpr_write_b32 a23, v21 -; GFX950-NEXT: v_accvgpr_write_b32 a22, v20 -; GFX950-NEXT: v_accvgpr_write_b32 a21, v19 -; GFX950-NEXT: v_accvgpr_write_b32 a20, v18 -; GFX950-NEXT: v_accvgpr_write_b32 a19, v17 -; GFX950-NEXT: v_accvgpr_write_b32 a18, v16 -; GFX950-NEXT: v_accvgpr_write_b32 a17, v15 -; GFX950-NEXT: v_accvgpr_write_b32 a16, v14 -; GFX950-NEXT: v_accvgpr_write_b32 a15, v13 -; GFX950-NEXT: v_accvgpr_write_b32 a14, v12 -; GFX950-NEXT: v_accvgpr_write_b32 a13, v11 -; GFX950-NEXT: v_accvgpr_write_b32 a12, v10 -; GFX950-NEXT: v_accvgpr_write_b32 a11, v9 -; GFX950-NEXT: v_accvgpr_write_b32 a10, v8 -; GFX950-NEXT: v_accvgpr_write_b32 a9, v7 -; GFX950-NEXT: v_accvgpr_write_b32 a8, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a7, v5 -; GFX950-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: 
v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a3 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX950-NEXT: v_accvgpr_read_b32 v6, a8 -; GFX950-NEXT: v_accvgpr_read_b32 v7, a9 -; GFX950-NEXT: v_accvgpr_read_b32 v8, a10 -; GFX950-NEXT: v_accvgpr_read_b32 v9, a11 -; GFX950-NEXT: v_accvgpr_read_b32 v10, a12 -; GFX950-NEXT: v_accvgpr_read_b32 v11, a13 -; GFX950-NEXT: v_accvgpr_read_b32 v12, a14 -; GFX950-NEXT: v_accvgpr_read_b32 v13, a15 -; GFX950-NEXT: v_accvgpr_read_b32 v14, a16 -; GFX950-NEXT: v_accvgpr_read_b32 v15, a17 -; GFX950-NEXT: v_accvgpr_read_b32 v16, a18 -; GFX950-NEXT: v_accvgpr_read_b32 v17, a19 -; GFX950-NEXT: v_accvgpr_read_b32 v18, a20 -; GFX950-NEXT: v_accvgpr_read_b32 v19, a21 -; GFX950-NEXT: v_accvgpr_read_b32 v20, a22 -; GFX950-NEXT: v_accvgpr_read_b32 v21, a23 -; GFX950-NEXT: v_accvgpr_read_b32 v22, a24 -; GFX950-NEXT: v_accvgpr_read_b32 v23, a25 -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 -; GFX950-NEXT: v_accvgpr_read_b32 v25, a27 -; GFX950-NEXT: v_accvgpr_read_b32 v26, a28 -; GFX950-NEXT: v_accvgpr_read_b32 v27, a29 -; GFX950-NEXT: v_accvgpr_read_b32 v28, a30 -; GFX950-NEXT: v_accvgpr_read_b32 v29, a31 -; GFX950-NEXT: v_accvgpr_read_b32 v30, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v31, a33 -; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: 
; use v[0:31] -; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; 
GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 
x i32], ptr %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() @@ -679,43 +640,43 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB11_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB11_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB11_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen 
offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -727,39 +688,39 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ; def a[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB11_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: 
buffer_inv sc0 sc1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $agpr2_agpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB11_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB11_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off +; GFX950-NEXT: scratch_store_dwordx2 v0, a[2:3], off ; GFX950-NEXT: .LBB11_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -871,41 +832,41 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; 
GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB13_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB13_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB13_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB13_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -917,37 +878,37 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], 
src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB13_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB13_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off ; GFX950-NEXT: .LBB13_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -965,40 +926,41 @@ 
define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB14_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB14_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB14_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, 
s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1054,40 +1016,41 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB15_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB15_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB15_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; 
GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1143,40 +1106,41 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB16_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] 
glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB16_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB16_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -1188,37 +1152,37 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[4:5] +; 
GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB16_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB16_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB16_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off ; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -1420,10 +1384,12 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] 
; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 @@ -1440,13 +1406,14 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB19_2 ; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1516,6 +1483,7 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB20_2 ; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private @@ -1592,12 +1560,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1622,12 +1590,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB21_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1727,12 +1695,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1756,12 +1724,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB23_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1924,12 +1892,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1953,12 +1921,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB26_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -2526,7 +2494,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB32_4 @@ -2544,7 +2512,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_2 @@ -2558,18 +2528,18 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 
{ ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB32_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2586,7 +2556,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB32_4 @@ -2603,7 +2573,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB32_2 @@ -2618,15 +2590,15 @@ 
define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 -; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2770,7 +2742,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB34_4 @@ -2788,7 +2760,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_2 @@ -2802,18 +2776,18 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; 
GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB34_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2828,7 +2802,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB34_4 @@ -2845,7 +2819,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB34_2 @@ -2860,15 +2836,15 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 ; 
GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 -; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -3125,7 +3101,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB37_4 @@ -3143,7 +3119,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_2 @@ -3157,18 +3135,18 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: 
buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB37_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -3183,7 +3161,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB37_4 @@ -3200,7 +3178,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB37_2 @@ -3215,15 +3195,15 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: 
scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 -; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -4028,262 +4008,223 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v60, 
off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31 -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 
a26, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword 
v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX90A-NEXT: 
v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:31] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; 
GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 
; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; 
GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte 
Folded Spill -; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill +; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 
16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a34 +; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_accvgpr_write_b32 a33, v31 -; GFX950-NEXT: v_accvgpr_write_b32 a32, v30 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v29 -; GFX950-NEXT: v_accvgpr_write_b32 a30, v28 -; GFX950-NEXT: v_accvgpr_write_b32 a29, v27 -; GFX950-NEXT: v_accvgpr_write_b32 a28, v26 -; GFX950-NEXT: v_accvgpr_write_b32 a27, v25 -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 -; GFX950-NEXT: v_accvgpr_write_b32 a25, v23 -; GFX950-NEXT: v_accvgpr_write_b32 a24, v22 -; GFX950-NEXT: v_accvgpr_write_b32 a23, v21 -; GFX950-NEXT: v_accvgpr_write_b32 a22, v20 -; GFX950-NEXT: v_accvgpr_write_b32 a21, v19 -; GFX950-NEXT: v_accvgpr_write_b32 a20, v18 -; GFX950-NEXT: v_accvgpr_write_b32 a19, v17 -; GFX950-NEXT: v_accvgpr_write_b32 a18, v16 -; GFX950-NEXT: v_accvgpr_write_b32 a17, v15 -; GFX950-NEXT: v_accvgpr_write_b32 a16, v14 -; GFX950-NEXT: v_accvgpr_write_b32 a15, v13 -; GFX950-NEXT: v_accvgpr_write_b32 a14, v12 -; GFX950-NEXT: v_accvgpr_write_b32 a13, v11 -; GFX950-NEXT: v_accvgpr_write_b32 a12, v10 -; GFX950-NEXT: v_accvgpr_write_b32 a11, v9 -; GFX950-NEXT: v_accvgpr_write_b32 a10, v8 -; GFX950-NEXT: v_accvgpr_write_b32 a9, v7 -; GFX950-NEXT: v_accvgpr_write_b32 a8, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a7, v5 -; GFX950-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: 
v_accvgpr_read_b32 v2, a34 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a3 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX950-NEXT: v_accvgpr_read_b32 v6, a8 -; GFX950-NEXT: v_accvgpr_read_b32 v7, a9 -; GFX950-NEXT: v_accvgpr_read_b32 v8, a10 -; GFX950-NEXT: v_accvgpr_read_b32 v9, a11 -; GFX950-NEXT: v_accvgpr_read_b32 v10, a12 -; GFX950-NEXT: v_accvgpr_read_b32 v11, a13 -; GFX950-NEXT: v_accvgpr_read_b32 v12, a14 -; GFX950-NEXT: v_accvgpr_read_b32 v13, a15 -; GFX950-NEXT: v_accvgpr_read_b32 v14, a16 -; GFX950-NEXT: v_accvgpr_read_b32 v15, a17 -; GFX950-NEXT: v_accvgpr_read_b32 v16, a18 -; GFX950-NEXT: v_accvgpr_read_b32 v17, a19 -; GFX950-NEXT: v_accvgpr_read_b32 v18, a20 -; GFX950-NEXT: v_accvgpr_read_b32 v19, a21 -; GFX950-NEXT: v_accvgpr_read_b32 v20, a22 -; GFX950-NEXT: v_accvgpr_read_b32 v21, a23 -; GFX950-NEXT: v_accvgpr_read_b32 v22, a24 -; GFX950-NEXT: v_accvgpr_read_b32 v23, a25 -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 -; GFX950-NEXT: v_accvgpr_read_b32 v25, a27 -; GFX950-NEXT: 
v_accvgpr_read_b32 v26, a28 -; GFX950-NEXT: v_accvgpr_read_b32 v27, a29 -; GFX950-NEXT: v_accvgpr_read_b32 v28, a30 -; GFX950-NEXT: v_accvgpr_read_b32 v29, a31 -; GFX950-NEXT: v_accvgpr_read_b32 v30, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v31, a33 -; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:31] -; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 
v[4:7], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; 
GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() @@ -4367,37 +4308,39 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB53_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB53_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB53_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; 
GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 -; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB53_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -4412,35 +4355,37 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB53_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 -; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB53_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB53_4 ; GFX950-NEXT: ; %bb.3: ; 
%atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 -; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -4552,36 +4497,38 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB55_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB55_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; 
GFX90A-NEXT: s_cbranch_execz .LBB55_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 -; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB55_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -4594,35 +4541,37 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB55_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 -; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB55_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB55_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 -; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -4817,36 +4766,38 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB58_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_xor_x2 
v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB58_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB58_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 -; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB58_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -4859,35 +4810,37 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; 
GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB58_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 -; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB58_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB58_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 -; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5487,13 +5440,13 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -5516,12 +5469,12 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB69_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6109,13 +6062,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6140,13 +6093,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB85_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6240,12 +6193,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6268,12 +6221,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB87_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6352,45 +6305,48 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_add_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] 
; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB89_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB89_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB89_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: 
v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB89_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_add_i64_ret_a_a: @@ -6398,41 +6354,43 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB89_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: 
.LBB89_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB89_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off -; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi -; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -6530,45 +6488,48 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_sub_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB91_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB91_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB91_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, 
s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB91_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a: @@ -6576,43 +6537,45 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB91_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB91_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB91_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 
vcc, 0, v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off -; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi -; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -6712,45 +6675,48 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_and_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB93_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB93_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB93_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5 -; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB93_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, 
exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_and_i64_ret_a_a: @@ -6758,42 +6724,44 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB93_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB93_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB93_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; 
GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v3, v1, v5 -; GFX950-NEXT: v_and_b32_e32 v2, v0, v4 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off -; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi -; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -6901,7 +6869,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB95_4 @@ -6918,6 +6886,8 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -6931,21 +6901,21 @@ 
define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB95_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_and_b32_e32 v3, v1, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v4, v0, v6 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v4, v2, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_not_b32_e32 v2, v3 +; GFX90A-NEXT: v_not_b32_e32 v3, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB95_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -6964,7 +6934,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB95_4 @@ -6981,6 +6951,8 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -7000,13 +6972,13 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, v1, v7 ; GFX950-NEXT: v_and_b32_e32 v5, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_not_b32_e32 v3, v2 ; GFX950-NEXT: v_not_b32_e32 v2, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7146,45 +7118,48 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_or_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB97_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB97_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB97_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_or_b32_e32 v3, v1, v5 -; GFX90A-NEXT: v_or_b32_e32 v4, v0, v4 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB97_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_or_i64_ret_a_a: @@ -7192,42 +7167,44 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB97_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB97_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB97_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_or_b32_e32 v3, v1, v5 -; GFX950-NEXT: v_or_b32_e32 v2, v0, v4 -; 
GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off -; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi -; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -7332,40 +7309,43 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB99_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB99_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; 
GFX90A-NEXT: s_cbranch_execz .LBB99_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB99_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_max_i64_ret_a_a: @@ -7373,44 +7353,46 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; 
GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB99_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB99_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB99_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -7518,40 +7500,43 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB101_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB101_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB101_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: 
v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB101_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_min_i64_ret_a_a: @@ -7559,44 +7544,46 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB101_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB101_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB101_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -7704,40 +7691,43 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB103_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB103_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB103_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], 
v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB103_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a: @@ -7745,44 +7735,46 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB103_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB103_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB103_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -7890,40 +7882,43 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; 
GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB105_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB105_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB105_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: 
v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB105_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a: @@ -7931,44 +7926,46 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB105_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; 
GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB105_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB105_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -8076,42 +8073,45 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB107_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB107_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB107_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: 
v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB107_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a: @@ -8119,45 +8119,46 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB107_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB107_2: ; %Flow ; GFX950-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB107_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 -; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB107_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -8262,50 +8263,53 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB109_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB109_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB109_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc +; GFX90A-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] ; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB109_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a: @@ -8313,46 +8317,48 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB109_2 ; GFX950-NEXT: 
; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB109_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB109_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 ; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -8460,62 +8466,64 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB111_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB111_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[6:7] +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX90A-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB111_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB111_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB111_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc ; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v6 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[6:7] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: 
buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB111_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -8534,7 +8542,7 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB111_4 @@ -8554,6 +8562,8 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -8575,14 +8585,13 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB111_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -8739,7 +8748,7 @@ define void 
@flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB113_4 @@ -8757,6 +8766,8 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -8778,14 +8789,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -8804,7 +8815,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; 
GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB113_4 @@ -8824,6 +8835,8 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -8845,14 +8858,13 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -9010,53 +9022,55 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: ; implicit-def: $agpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB115_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], 
src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: ; implicit-def: $agpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB115_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX90A-NEXT: global_atomic_add_f32 v2, v[0:1], v3, off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr3 ; GFX90A-NEXT: .LBB115_3: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB115_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v3 -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: .LBB115_5: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr2 ; GFX90A-NEXT: .LBB115_6: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB115_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: ds_add_rtn_f32 v2, v0, v3 +; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: .LBB115_8: ; 
%atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a: @@ -9175,12 +9189,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9203,12 +9217,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9298,13 +9312,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9328,13 +9342,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9428,13 +9442,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9458,13 +9472,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; 
GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9559,13 +9573,13 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9588,12 +9602,12 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9687,13 +9701,13 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; 
GFX90A-NEXT: ;;#ASMEND @@ -9716,12 +9730,12 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9803,63 +9817,68 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fadd_f64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB127_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, 
s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB127_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB127_3: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB127_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB127_5: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB127_6: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB127_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared -; 
GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a: @@ -9867,61 +9886,65 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB127_6 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1 +; 
GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB127_3 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB127_3: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB127_5 ; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB127_5: ; %Flow1 ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB127_6: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB127_8 ; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc -; GFX950-NEXT: 
ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi -; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -10066,7 +10089,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB129_4 @@ -10080,7 +10103,9 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_2 @@ -10096,14 +10121,15 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 
offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB129_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -10122,7 +10148,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB129_4 @@ -10136,7 +10162,9 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB129_2 @@ -10153,12 +10181,12 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi ; GFX950-NEXT: 
s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -10285,46 +10313,49 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_f64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB131_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB131_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB131_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 
0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB131_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a: @@ -10332,43 +10363,45 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB131_2 ; GFX950-NEXT: ; %bb.1: ; 
%atomicrmw.global -; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB131_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB131_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX950-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -10469,46 +10502,49 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_f64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB133_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB133_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB133_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 
; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB133_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a: @@ -10516,43 +10552,45 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB133_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; 
GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB133_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB133_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX950-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX950-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -10662,7 +10700,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB135_4 @@ -10680,6 +10718,8 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; 
GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -10697,17 +10737,18 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -10726,7 +10767,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB135_4 @@ -10745,6 +10786,8 @@ define void 
@flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -10765,14 +10808,13 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB135_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -10926,7 +10968,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB137_4 @@ -10944,6 +10986,8 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: 
v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -10961,17 +11005,18 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -10990,7 +11035,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB137_4 @@ -11009,6 +11054,8 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], 
vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -11029,14 +11076,13 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB137_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11199,12 +11245,12 @@ define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB139_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11296,12 +11342,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: 
;;#ASMEND @@ -11324,12 +11370,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB141_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11419,13 +11465,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11450,13 +11496,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB143_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11551,13 +11597,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { ; 
GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11582,13 +11628,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB145_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11688,13 +11734,13 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11717,12 +11763,12 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB147_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11824,13 +11870,13 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11853,12 +11899,12 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB149_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11975,13 +12021,13 @@ define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12108,13 +12154,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12142,13 +12188,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB153_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12278,13 +12324,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; 
GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12312,13 +12358,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB155_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12448,13 +12494,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12482,13 +12528,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB157_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12623,13 +12669,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12657,13 +12703,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB159_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12803,13 +12849,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, 
s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB161_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12837,13 +12883,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB161_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13282,13 +13328,13 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13312,12 +13358,12 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 
exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB171_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14043,13 +14089,13 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14075,13 +14121,13 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB189_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14178,12 +14224,12 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz 
.LBB191_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14207,12 +14253,12 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB191_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14309,26 +14355,28 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB193_3 ; GFX90A-NEXT: s_branch .LBB193_4 ; GFX90A-NEXT: .LBB193_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword a0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: 
buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a: @@ -14350,23 +14398,25 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX950-NEXT: s_cbranch_execz .LBB193_3 ; GFX950-NEXT: s_branch .LBB193_4 ; GFX950-NEXT: .LBB193_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr2_agpr3 ; GFX950-NEXT: .LBB193_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 a[2:3], off, s0 ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 off, a[0:1], s0 ; GFX950-NEXT: .LBB193_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_waitcnt vmcnt(1) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ; use a[2:3] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -14473,28 +14523,32 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], 
s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB195_3 ; GFX90A-NEXT: s_branch .LBB195_4 ; GFX90A-NEXT: .LBB195_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB195_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB195_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_a_a: @@ -14516,24 +14570,27 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_mov_b64_e32 v[2:3], 
s[0:1] ; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB195_3 ; GFX950-NEXT: s_branch .LBB195_4 ; GFX950-NEXT: .LBB195_2: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB195_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -14643,28 +14700,32 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB197_3 ; GFX90A-NEXT: s_branch .LBB197_4 ; GFX90A-NEXT: .LBB197_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB197_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: 
buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB197_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_a_a: @@ -14678,34 +14739,37 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB197_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB197_3 ; GFX950-NEXT: s_branch .LBB197_4 ; GFX950-NEXT: .LBB197_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB197_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -14817,28 +14881,32 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB199_3 ; GFX90A-NEXT: s_branch .LBB199_4 ; GFX90A-NEXT: .LBB199_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; 
implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB199_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 ; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB199_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_a_a: @@ -14852,33 +14920,36 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB199_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 
v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB199_3 ; GFX950-NEXT: s_branch .LBB199_4 ; GFX950-NEXT: .LBB199_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB199_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 -; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX950-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -14999,6 +15070,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -15007,25 +15080,25 @@ define 
void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB201_6 ; GFX90A-NEXT: .LBB201_4: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB201_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_not_b32_e32 v2, v3 +; GFX90A-NEXT: v_not_b32_e32 v3, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -15061,6 +15134,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 
s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -15069,7 +15144,7 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB201_6 ; GFX950-NEXT: .LBB201_4: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_cbranch_execz .LBB201_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -15078,12 +15153,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, v1, v5 ; GFX950-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_not_b32_e32 v3, v2 ; GFX950-NEXT: v_not_b32_e32 v2, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -15235,28 +15310,32 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB203_3 ; GFX90A-NEXT: s_branch .LBB203_4 ; GFX90A-NEXT: .LBB203_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB203_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 
offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 ; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB203_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_a_a: @@ -15270,33 +15349,36 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB203_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB203_3 ; GFX950-NEXT: s_branch .LBB203_4 ; 
GFX950-NEXT: .LBB203_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB203_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX950-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -15407,28 +15489,32 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB205_3 ; GFX90A-NEXT: s_branch .LBB205_4 ; GFX90A-NEXT: .LBB205_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB205_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v4, 
s[0:3], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v0, v4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 ; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1 -; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB205_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_a_a: @@ -15442,33 +15528,36 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB205_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB205_3 ; GFX950-NEXT: s_branch 
.LBB205_4 ; GFX950-NEXT: .LBB205_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB205_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -15579,29 +15668,33 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB207_3 ; GFX90A-NEXT: s_branch .LBB207_4 ; GFX90A-NEXT: .LBB207_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB207_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, 
s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB207_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_a_a: @@ -15615,35 +15708,38 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB207_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB207_3 ; GFX950-NEXT: s_branch .LBB207_4 ; GFX950-NEXT: .LBB207_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: 
.LBB207_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -15757,29 +15853,33 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB209_3 ; GFX90A-NEXT: s_branch .LBB209_4 ; GFX90A-NEXT: .LBB209_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB209_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 
0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB209_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_a_a: @@ -15793,35 +15893,38 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB209_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB209_3 ; GFX950-NEXT: s_branch .LBB209_4 ; GFX950-NEXT: .LBB209_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; 
GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB209_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -15935,29 +16038,33 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB211_3 ; GFX90A-NEXT: s_branch .LBB211_4 ; GFX90A-NEXT: .LBB211_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB211_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: 
v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB211_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_a_a: @@ -15971,35 +16078,38 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB211_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB211_3 ; GFX950-NEXT: s_branch .LBB211_4 ; 
GFX950-NEXT: .LBB211_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB211_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16113,29 +16223,33 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB213_3 ; GFX90A-NEXT: s_branch .LBB213_4 ; GFX90A-NEXT: .LBB213_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB213_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 
s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB213_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_a_a: @@ -16149,35 +16263,38 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB213_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: 
s_cbranch_execz .LBB213_3 ; GFX950-NEXT: s_branch .LBB213_4 ; GFX950-NEXT: .LBB213_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB213_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 -; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16283,39 +16400,43 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB215_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], 
s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB215_3 ; GFX90A-NEXT: s_branch .LBB215_4 ; GFX90A-NEXT: .LBB215_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB215_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB215_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 
a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a: @@ -16329,36 +16450,38 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB215_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB215_3 ; GFX950-NEXT: s_branch .LBB215_4 ; GFX950-NEXT: .LBB215_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB215_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 -; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, 1 +; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB215_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16475,33 +16598,37 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB217_3 ; GFX90A-NEXT: s_branch .LBB217_4 ; GFX90A-NEXT: .LBB217_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB217_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] ; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; 
GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a: @@ -16515,37 +16642,40 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB217_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB217_3 ; GFX950-NEXT: s_branch .LBB217_4 ; GFX950-NEXT: .LBB217_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB217_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s2, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s2 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 ; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2 ; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() @@ -16676,6 +16806,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -16684,7 +16816,7 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB219_6 ; GFX90A-NEXT: .LBB219_4: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB219_6 ; 
GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -16697,13 +16829,13 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc ; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB219_6: ; %atomicrmw.phi -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -16742,6 +16874,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -16750,7 +16884,7 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB219_6 ; GFX950-NEXT: .LBB219_4: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_cbranch_execz .LBB219_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -16761,13 +16895,12 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: 
v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB219_6: ; %atomicrmw.phi -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -16938,6 +17071,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -16946,7 +17081,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB221_6 ; GFX90A-NEXT: .LBB221_4: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB221_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -16959,13 +17094,13 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 
offen offset:4 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -17004,6 +17139,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -17012,7 +17149,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB221_6 ; GFX950-NEXT: .LBB221_4: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_cbranch_execz .LBB221_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -17023,13 +17160,12 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -17197,36 +17333,38 @@ define void 
@flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_cbranch_execz .LBB223_5 ; GFX90A-NEXT: s_branch .LBB223_6 ; GFX90A-NEXT: .LBB223_3: -; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0 ; GFX90A-NEXT: s_branch .LBB223_7 ; GFX90A-NEXT: .LBB223_4: -; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0 ; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v3, v1, v0 -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GFX90A-NEXT: .LBB223_6: ; %Flow1 ; GFX90A-NEXT: s_cbranch_execnz .LBB223_8 ; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: ds_add_rtn_f32 v1, v1, v0 +; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a: @@ -17347,12 +17485,12 @@ define void 
@flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17376,12 +17514,12 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17459,29 +17597,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17491,29 +17629,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 
v0, v1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17597,29 +17735,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz 
.LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17629,29 +17767,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17735,30 +17873,29 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17782,12 +17919,12 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; 
GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17868,30 +18005,29 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17915,12 +18051,12 @@ define void 
@flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -18025,21 +18161,27 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB235_5 ; GFX90A-NEXT: s_branch .LBB235_6 ; GFX90A-NEXT: .LBB235_3: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_branch .LBB235_7 ; GFX90A-NEXT: .LBB235_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB235_6: ; %Flow1 @@ -18047,17 +18189,16 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg 
%ptr) #0 { ; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: .LBB235_8: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a: @@ -18084,36 +18225,40 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB235_5 ; GFX950-NEXT: s_branch .LBB235_6 ; GFX950-NEXT: .LBB235_3: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_branch .LBB235_7 ; GFX950-NEXT: .LBB235_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; GFX950-NEXT: .LBB235_6: ; %Flow1 ; GFX950-NEXT: s_cbranch_execnz .LBB235_8 ; GFX950-NEXT: 
.LBB235_7: ; %atomicrmw.shared ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_mov_b32_e32 v2, s0 -; GFX950-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: .LBB235_8: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -18263,7 +18408,9 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_2 @@ -18271,7 +18418,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB237_6 ; GFX90A-NEXT: .LBB237_4: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB237_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -18279,13 +18426,14 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword 
v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB237_6: ; %atomicrmw.phi -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -18318,7 +18466,9 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_2 @@ -18326,18 +18476,18 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB237_6 ; GFX950-NEXT: .LBB237_4: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_cbranch_execz .LBB237_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; 
GFX950-NEXT: ;;#ASMEND @@ -18468,37 +18618,41 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB239_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB239_3 ; GFX90A-NEXT: s_branch .LBB239_4 ; GFX90A-NEXT: .LBB239_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB239_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[4:5], 
v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB239_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a: @@ -18520,10 +18674,13 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB239_3 ; GFX950-NEXT: s_branch .LBB239_4 ; GFX950-NEXT: .LBB239_2: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB239_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 @@ -18531,15 +18688,15 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 
s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -18644,37 +18801,41 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB241_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB241_3 ; GFX90A-NEXT: s_branch .LBB241_4 ; GFX90A-NEXT: .LBB241_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB241_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; 
GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB241_4: ; %atomicrmw.end -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a: @@ -18696,10 +18857,13 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB241_3 ; GFX950-NEXT: s_branch .LBB241_4 ; GFX950-NEXT: .LBB241_2: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB241_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 @@ -18707,15 +18871,15 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use 
a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -18839,6 +19003,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -18847,7 +19013,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB243_6 ; GFX90A-NEXT: .LBB243_4: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB243_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -18856,16 +19022,17 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; 
GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -18903,6 +19070,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -18911,7 +19080,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB243_6 ; GFX950-NEXT: .LBB243_4: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_cbranch_execz .LBB243_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -18921,13 +19090,12 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB243_6: ; %atomicrmw.phi -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -19095,6 +19263,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] @@ -19103,7 +19273,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB245_6 ; GFX90A-NEXT: .LBB245_4: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB245_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -19112,16 +19282,17 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -19159,6 +19330,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] @@ -19167,7 +19340,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB245_6 ; GFX950-NEXT: .LBB245_4: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_cbranch_execz .LBB245_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -19177,13 +19350,12 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB245_6: ; %atomicrmw.phi -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -19343,12 +19515,12 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB247_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19446,12 
+19618,12 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19475,12 +19647,12 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB249_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -19558,29 +19730,29 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19590,30 +19762,30 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB251_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -19698,29 +19870,29 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, 
exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19730,30 +19902,30 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB253_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -19859,13 +20031,13 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg 
%ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19889,12 +20061,12 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB255_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20000,13 +20172,13 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20030,12 +20202,12 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB257_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20156,13 +20328,13 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20297,13 +20469,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20333,13 +20505,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB261_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20475,13 +20647,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20511,13 +20683,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB263_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20653,13 +20825,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20689,13 +20861,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB265_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20836,13 +21008,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20872,13 +21044,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB267_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -21024,13 +21196,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -21060,13 +21232,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB269_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll index 
37a44d8b4b7d1..063feec759efa 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll @@ -449,13 +449,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[2:3] +; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -483,13 +483,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[2:3] +; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -515,8 +515,8 @@ define void @global_atomic_cmpxchg_i64_ret_v_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; 
def v[4:5] ; CHECK-NEXT: ;;#ASMEND @@ -545,8 +545,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_v__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND @@ -661,8 +661,8 @@ define void @global_atomic_cmpxchg_i64_ret_av_a__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:5] ; CHECK-NEXT: ;;#ASMEND @@ -691,8 +691,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index c54421ae64528..c98fff96d7b8a 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -338,264 +338,225 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded 
Spill -; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX90A-NEXT: 
v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31 -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 
4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 -; 
GFX90A-NEXT: v_accvgpr_read_b32 v25, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:31] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX90A-NEXT: 
v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v41, s32 
offset:68 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill +; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; 
Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a34 +; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_accvgpr_write_b32 a33, v31 -; GFX950-NEXT: v_accvgpr_write_b32 a32, v30 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v29 -; GFX950-NEXT: v_accvgpr_write_b32 a30, v28 -; GFX950-NEXT: v_accvgpr_write_b32 a29, v27 -; GFX950-NEXT: v_accvgpr_write_b32 a28, v26 -; GFX950-NEXT: v_accvgpr_write_b32 a27, v25 -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 -; GFX950-NEXT: v_accvgpr_write_b32 a25, v23 -; GFX950-NEXT: v_accvgpr_write_b32 a24, v22 -; GFX950-NEXT: v_accvgpr_write_b32 a23, v21 -; GFX950-NEXT: v_accvgpr_write_b32 a22, v20 -; GFX950-NEXT: v_accvgpr_write_b32 a21, v19 -; GFX950-NEXT: v_accvgpr_write_b32 a20, v18 -; GFX950-NEXT: v_accvgpr_write_b32 a19, v17 -; GFX950-NEXT: v_accvgpr_write_b32 a18, v16 -; GFX950-NEXT: v_accvgpr_write_b32 a17, v15 -; GFX950-NEXT: v_accvgpr_write_b32 a16, v14 -; GFX950-NEXT: v_accvgpr_write_b32 a15, v13 -; GFX950-NEXT: v_accvgpr_write_b32 a14, v12 -; GFX950-NEXT: v_accvgpr_write_b32 a13, v11 -; GFX950-NEXT: v_accvgpr_write_b32 a12, v10 -; GFX950-NEXT: 
v_accvgpr_write_b32 a11, v9 -; GFX950-NEXT: v_accvgpr_write_b32 a10, v8 -; GFX950-NEXT: v_accvgpr_write_b32 a9, v7 -; GFX950-NEXT: v_accvgpr_write_b32 a8, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a7, v5 -; GFX950-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a3 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX950-NEXT: v_accvgpr_read_b32 v6, a8 
-; GFX950-NEXT: v_accvgpr_read_b32 v7, a9 -; GFX950-NEXT: v_accvgpr_read_b32 v8, a10 -; GFX950-NEXT: v_accvgpr_read_b32 v9, a11 -; GFX950-NEXT: v_accvgpr_read_b32 v10, a12 -; GFX950-NEXT: v_accvgpr_read_b32 v11, a13 -; GFX950-NEXT: v_accvgpr_read_b32 v12, a14 -; GFX950-NEXT: v_accvgpr_read_b32 v13, a15 -; GFX950-NEXT: v_accvgpr_read_b32 v14, a16 -; GFX950-NEXT: v_accvgpr_read_b32 v15, a17 -; GFX950-NEXT: v_accvgpr_read_b32 v16, a18 -; GFX950-NEXT: v_accvgpr_read_b32 v17, a19 -; GFX950-NEXT: v_accvgpr_read_b32 v18, a20 -; GFX950-NEXT: v_accvgpr_read_b32 v19, a21 -; GFX950-NEXT: v_accvgpr_read_b32 v20, a22 -; GFX950-NEXT: v_accvgpr_read_b32 v21, a23 -; GFX950-NEXT: v_accvgpr_read_b32 v22, a24 -; GFX950-NEXT: v_accvgpr_read_b32 v23, a25 -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 -; GFX950-NEXT: v_accvgpr_read_b32 v25, a27 -; GFX950-NEXT: v_accvgpr_read_b32 v26, a28 -; GFX950-NEXT: v_accvgpr_read_b32 v27, a29 -; GFX950-NEXT: v_accvgpr_read_b32 v28, a30 -; GFX950-NEXT: v_accvgpr_read_b32 v29, a31 -; GFX950-NEXT: v_accvgpr_read_b32 v30, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v31, a33 -; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:31] -; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload -; 
GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; 
GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() @@ -1101,12 +1062,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1131,12 +1092,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB21_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1236,12 +1197,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1265,12 +1226,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB23_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1433,12 +1394,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: 
s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1462,12 +1423,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB26_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -2046,14 +2007,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2079,14 +2040,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 
exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB32_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2190,14 +2151,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2221,14 +2182,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB34_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2395,14 +2356,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 
v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2426,14 +2387,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB37_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -3025,262 +2986,223 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v45, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 
a16, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31 -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword 
v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; 
GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, 
a31 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:31] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 
4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; 
GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword 
off, v44, s32 offset:56 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill +; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 
; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a34 +; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_accvgpr_write_b32 a33, v31 -; GFX950-NEXT: v_accvgpr_write_b32 a32, v30 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v29 -; GFX950-NEXT: v_accvgpr_write_b32 a30, v28 -; GFX950-NEXT: v_accvgpr_write_b32 a29, v27 -; GFX950-NEXT: v_accvgpr_write_b32 a28, v26 -; GFX950-NEXT: v_accvgpr_write_b32 a27, v25 -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 -; GFX950-NEXT: v_accvgpr_write_b32 a25, v23 -; GFX950-NEXT: v_accvgpr_write_b32 a24, v22 -; GFX950-NEXT: v_accvgpr_write_b32 a23, v21 -; GFX950-NEXT: v_accvgpr_write_b32 a22, v20 -; GFX950-NEXT: v_accvgpr_write_b32 a21, v19 -; GFX950-NEXT: v_accvgpr_write_b32 a20, v18 -; GFX950-NEXT: v_accvgpr_write_b32 a19, v17 -; GFX950-NEXT: v_accvgpr_write_b32 a18, v16 -; GFX950-NEXT: v_accvgpr_write_b32 a17, v15 -; GFX950-NEXT: v_accvgpr_write_b32 a16, v14 -; GFX950-NEXT: v_accvgpr_write_b32 a15, v13 -; GFX950-NEXT: v_accvgpr_write_b32 a14, v12 -; GFX950-NEXT: v_accvgpr_write_b32 a13, v11 -; GFX950-NEXT: v_accvgpr_write_b32 a12, v10 -; GFX950-NEXT: v_accvgpr_write_b32 a11, v9 -; GFX950-NEXT: v_accvgpr_write_b32 a10, v8 -; GFX950-NEXT: v_accvgpr_write_b32 a9, v7 -; GFX950-NEXT: v_accvgpr_write_b32 a8, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a7, v5 -; GFX950-NEXT: v_accvgpr_write_b32 
a6, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a34 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX950-NEXT: v_accvgpr_read_b32 v1, a3 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX950-NEXT: v_accvgpr_read_b32 v6, a8 -; GFX950-NEXT: v_accvgpr_read_b32 v7, a9 -; GFX950-NEXT: v_accvgpr_read_b32 v8, a10 -; GFX950-NEXT: v_accvgpr_read_b32 v9, a11 -; GFX950-NEXT: v_accvgpr_read_b32 v10, a12 -; GFX950-NEXT: v_accvgpr_read_b32 v11, a13 -; GFX950-NEXT: v_accvgpr_read_b32 v12, a14 
-; GFX950-NEXT: v_accvgpr_read_b32 v13, a15 -; GFX950-NEXT: v_accvgpr_read_b32 v14, a16 -; GFX950-NEXT: v_accvgpr_read_b32 v15, a17 -; GFX950-NEXT: v_accvgpr_read_b32 v16, a18 -; GFX950-NEXT: v_accvgpr_read_b32 v17, a19 -; GFX950-NEXT: v_accvgpr_read_b32 v18, a20 -; GFX950-NEXT: v_accvgpr_read_b32 v19, a21 -; GFX950-NEXT: v_accvgpr_read_b32 v20, a22 -; GFX950-NEXT: v_accvgpr_read_b32 v21, a23 -; GFX950-NEXT: v_accvgpr_read_b32 v22, a24 -; GFX950-NEXT: v_accvgpr_read_b32 v23, a25 -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 -; GFX950-NEXT: v_accvgpr_read_b32 v25, a27 -; GFX950-NEXT: v_accvgpr_read_b32 v26, a28 -; GFX950-NEXT: v_accvgpr_read_b32 v27, a29 -; GFX950-NEXT: v_accvgpr_read_b32 v28, a30 -; GFX950-NEXT: v_accvgpr_read_b32 v29, a31 -; GFX950-NEXT: v_accvgpr_read_b32 v30, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v31, a33 -; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:31] -; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload -; GFX950-NEXT: 
scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; 
GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() @@ -3971,13 +3893,13 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -4000,12 +3922,12 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: 
s_cbranch_execnz .LBB69_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -4593,13 +4515,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -4624,13 +4546,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB85_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -4724,12 +4646,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -4752,12 +4674,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB87_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -5078,14 +5000,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB95_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5111,14 +5033,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: 
s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB95_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5742,14 +5664,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB111_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5778,14 +5700,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB111_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5888,14 +5810,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) 
#0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB113_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5924,14 +5846,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB113_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -6105,12 +6027,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6133,12 +6055,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6228,13 +6150,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6258,13 +6180,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6358,13 
+6280,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6388,13 +6310,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6489,13 +6411,13 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6518,12 
+6440,12 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6617,13 +6539,13 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6646,12 +6568,12 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6822,14 +6744,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], 
off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -6852,14 +6774,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB129_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7102,14 +7024,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB135_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7137,14 +7059,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB135_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7246,14 +7168,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB137_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7281,14 +7203,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; 
GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB137_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7461,12 +7383,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7489,12 +7411,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB141_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7584,13 +7506,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7615,13 +7537,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB143_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7716,13 +7638,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7747,13 +7669,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB145_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7853,13 +7775,13 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7882,12 +7804,12 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB147_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7989,13 +7911,13 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 
a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8018,12 +7940,12 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB149_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8140,13 +8062,13 @@ define void @global_atomic_fadd_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8273,13 +8195,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8307,13 +8229,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB153_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8443,13 +8365,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8477,13 +8399,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB155_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8613,13 +8535,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8647,13 +8569,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB157_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8788,13 +8710,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8822,13 +8744,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB159_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8968,13 +8890,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB161_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9002,13 +8924,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB161_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9431,13 +9353,13 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9461,12 +9383,12 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB171_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -10160,13 +10082,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -10192,13 +10114,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB189_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -10295,12 +10217,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB191_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -10324,12 +10246,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB191_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -10740,14 +10662,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB201_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -10774,14 +10696,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB201_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11507,14 +11429,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB219_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -11544,14 +11466,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB219_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11657,14 +11579,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB221_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -11694,14 +11616,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB221_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11882,12 +11804,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11911,12 +11833,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: 
s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12009,13 +11931,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12040,13 +11962,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12143,13 +12065,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12174,13 +12096,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12278,13 +12200,13 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12308,12 +12230,12 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, 
s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12410,13 +12332,13 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12440,12 +12362,12 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12622,14 +12544,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12653,14 +12575,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12912,14 +12834,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB243_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12948,14 +12870,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; 
GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB243_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -13060,14 +12982,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB245_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -13096,14 +13018,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB245_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: 
s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -13283,12 +13205,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13312,12 +13234,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB249_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13410,13 +13332,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13442,13 +13364,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB251_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13546,13 +13468,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13578,13 +13500,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB253_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13687,13 +13609,13 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13717,12 +13639,12 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB255_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13827,13 +13749,13 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13857,12 +13779,12 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB257_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13982,13 +13904,13 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14119,13 +14041,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: 
;;#ASMEND @@ -14154,13 +14076,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB261_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14293,13 +14215,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14328,13 +14250,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB263_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; 
use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14467,13 +14389,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14502,13 +14424,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB265_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14646,13 +14568,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14681,13 +14603,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB267_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14830,13 +14752,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14865,13 +14787,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB269_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, 
v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index ebbeab94066d6..9e240238c1066 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -146,9 +146,9 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v39, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v39 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND @@ -437,9 +437,9 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 { ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v33, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v35 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v33 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND @@ -1045,9 +1045,9 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v39, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v39 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll index 0c5fd1fc0932a..63b7b70548baf 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll @@ -180,63 +180,55 @@ define amdgpu_kernel void @test_call_empty() #0 { ; 
GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: 
v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: 
s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -321,65 +313,57 @@ define amdgpu_kernel void @test_call_areg4() #0 { ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def a[4:35] +; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 +; 
GFX908-NEXT: v_accvgpr_read_b32 v28, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: v_accvgpr_read_b32 v0, a32 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a33 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a34 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a35 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: 
global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -464,65 +448,57 @@ define amdgpu_kernel void @test_call_areg32() #0 { ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def a[32:63] +; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 +; GFX908-NEXT: 
v_accvgpr_read_b32 v14, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: v_accvgpr_read_b32 v0, a60 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a61 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a62 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a63 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a56 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a57 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a58 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a59 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a52 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a53 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a54 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a55 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; 
GFX908-NEXT: v_accvgpr_read_b32 v0, a48 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a49 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a50 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a51 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a44 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a45 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a46 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a47 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a40 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a41 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a42 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a43 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a36 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a37 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a38 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a39 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a32 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a33 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a34 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a35 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: 
global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -609,63 +585,55 @@ define amdgpu_kernel void @test_call_areg64() #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; 
GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, 
a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -750,65 +718,57 @@ define amdgpu_kernel void @test_call_areg31_63() #0 { ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def a[64:95] +; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 +; 
GFX908-NEXT: v_accvgpr_read_b32 v26, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: v_accvgpr_read_b32 v0, a92 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a93 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a94 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a95 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a88 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a89 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a90 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a91 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a84 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a85 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a86 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a87 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a80 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a81 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a82 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a83 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a76 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a77 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a78 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a79 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 
v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a72 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a73 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a74 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a75 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a68 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a69 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a70 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a71 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_read_b32 v0, a64 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a65 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a66 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a67 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -889,125 +849,61 @@ define amdgpu_kernel void @test_call_unknown() #0 { ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def a[0:31] -; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: 
s_mov_b64 s[2:3], s[38:39] ; GFX908-NEXT: s_mov_b32 s32, 0 -; GFX908-NEXT: v_accvgpr_read_b32 v95, a0 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v94, a1 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v93, a2 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v92, a3 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v91, a4 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v90, a5 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v89, a6 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v88, a7 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v79, a8 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v78, a9 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v77, a10 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v76, a11 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v75, a12 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v74, a13 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v73, a14 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v72, a15 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v61, a18 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v60, a19 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v59, a20 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v58, a21 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v57, a22 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v56, a23 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v47, a24 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v46, a25 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v45, a26 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v44, a27 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v43, a28 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v42, a29 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v41, a30 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v40, a31 ; Reload Reuse +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def a[0:31] +; 
GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v43, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v42, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v41, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v40, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v47, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v46, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v45, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v44, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v59, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v58, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v57, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v56, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v63, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v62, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v61, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v60, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v75, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v74, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v73, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v72, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v79, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v78, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v77, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v76, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v91, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v90, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v89, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v88, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v95, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v94, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v93, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v92, a28 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: v_mov_b32_e32 v4, v95 -; GFX908-NEXT: v_mov_b32_e32 v5, v94 -; GFX908-NEXT: v_mov_b32_e32 v6, v93 -; GFX908-NEXT: v_mov_b32_e32 v7, v92 -; GFX908-NEXT: v_mov_b32_e32 v8, v91 -; GFX908-NEXT: v_mov_b32_e32 v9, v90 -; GFX908-NEXT: v_mov_b32_e32 v10, v89 -; GFX908-NEXT: v_mov_b32_e32 v11, v88 -; GFX908-NEXT: v_mov_b32_e32 v12, v79 -; GFX908-NEXT: v_mov_b32_e32 v13, v78 -; GFX908-NEXT: v_mov_b32_e32 v14, v77 -; GFX908-NEXT: v_mov_b32_e32 v15, v76 -; GFX908-NEXT: v_mov_b32_e32 
v16, v75 -; GFX908-NEXT: v_mov_b32_e32 v17, v74 -; GFX908-NEXT: v_mov_b32_e32 v18, v73 -; GFX908-NEXT: v_mov_b32_e32 v19, v72 -; GFX908-NEXT: v_mov_b32_e32 v20, v63 -; GFX908-NEXT: v_mov_b32_e32 v21, v62 -; GFX908-NEXT: v_mov_b32_e32 v22, v61 -; GFX908-NEXT: v_mov_b32_e32 v23, v60 -; GFX908-NEXT: v_mov_b32_e32 v24, v59 -; GFX908-NEXT: v_mov_b32_e32 v25, v58 -; GFX908-NEXT: v_mov_b32_e32 v26, v57 -; GFX908-NEXT: v_mov_b32_e32 v27, v56 -; GFX908-NEXT: v_mov_b32_e32 v28, v47 -; GFX908-NEXT: v_mov_b32_e32 v29, v46 -; GFX908-NEXT: v_mov_b32_e32 v30, v45 -; GFX908-NEXT: v_mov_b32_e32 v31, v44 -; GFX908-NEXT: v_mov_b32_e32 v32, v43 -; GFX908-NEXT: v_mov_b32_e32 v33, v42 -; GFX908-NEXT: v_mov_b32_e32 v34, v41 -; GFX908-NEXT: v_mov_b32_e32 v35, v40 -; GFX908-NEXT: v_mov_b32_e32 v0, v32 -; GFX908-NEXT: v_mov_b32_e32 v1, v33 -; GFX908-NEXT: v_mov_b32_e32 v2, v34 -; GFX908-NEXT: v_mov_b32_e32 v3, v35 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, v28 -; GFX908-NEXT: v_mov_b32_e32 v1, v29 -; GFX908-NEXT: v_mov_b32_e32 v2, v30 -; GFX908-NEXT: v_mov_b32_e32 v3, v31 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, v24 -; GFX908-NEXT: v_mov_b32_e32 v1, v25 -; GFX908-NEXT: v_mov_b32_e32 v2, v26 -; GFX908-NEXT: v_mov_b32_e32 v3, v27 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, v20 -; GFX908-NEXT: v_mov_b32_e32 v1, v21 -; GFX908-NEXT: v_mov_b32_e32 v2, v22 -; GFX908-NEXT: v_mov_b32_e32 v3, v23 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, v16 -; GFX908-NEXT: v_mov_b32_e32 v1, v17 -; GFX908-NEXT: v_mov_b32_e32 v2, v18 -; GFX908-NEXT: v_mov_b32_e32 v3, v19 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 
v0, v12 -; GFX908-NEXT: v_mov_b32_e32 v1, v13 -; GFX908-NEXT: v_mov_b32_e32 v2, v14 -; GFX908-NEXT: v_mov_b32_e32 v3, v15 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, v8 -; GFX908-NEXT: v_mov_b32_e32 v1, v9 -; GFX908-NEXT: v_mov_b32_e32 v2, v10 -; GFX908-NEXT: v_mov_b32_e32 v3, v11 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: v_mov_b32_e32 v2, v6 -; GFX908-NEXT: v_mov_b32_e32 v3, v7 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[92:95], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[88:91], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[76:79], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[72:75], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[60:63], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[56:59], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[44:47], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[40:43], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll index 1180fc7b35a0b..1a2dd6e5f90f6 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll @@ -29,17 +29,17 @@ define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32 ; GFX908-LABEL: remat_regcopy_avoids_spill: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a1, v5 -; GFX908-NEXT: 
v_accvgpr_write_b32 a2, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v1 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v7 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_write_b32 a3, v6 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 2cbf39e2464bc..c3b14e8829042 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -423,8 +423,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: buffer_atomic_add_f32 v6, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX942-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -432,7 +431,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -542,8 
+541,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: buffer_atomic_add_f32 v6, v4, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: buffer_atomic_add_f32 v5, v4, s[8:11], 0 offen offset:1024 glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -551,7 +549,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2443,8 +2441,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 @@ -2458,7 +2456,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 @@ -2610,8 +2607,8 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -2623,7 +2620,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 @@ -4489,6 +4485,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4502,7 +4499,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_4 @@ -4778,6 +4774,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 
; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -4789,7 +4786,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 @@ -6352,6 +6348,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6364,7 +6361,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 @@ -6678,6 +6674,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: 
v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6689,7 +6686,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 @@ -7532,8 +7528,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: buffer_atomic_pk_add_f16 v6, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -7541,7 +7536,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -7687,8 +7682,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v6, v4, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc ; 
GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -7696,7 +7690,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9981,6 +9975,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -9993,7 +9988,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_4 @@ -10307,6 +10301,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -10318,7 +10313,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 187c8c9c11fa3..f7a1fb35c8106 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -429,6 +429,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 ; GFX942-NEXT: v_max_f32_e32 v6, v4, v9 ; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -442,7 +443,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_4 @@ -549,6 +549,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 ; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -560,7 +561,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 
; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 @@ -1653,8 +1653,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 @@ -1668,7 +1668,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 @@ -1784,8 +1783,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -1797,7 +1796,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; 
GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 @@ -3605,6 +3603,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3618,7 +3617,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_4 @@ -3904,6 +3902,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3915,7 +3914,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 
exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 @@ -5486,6 +5484,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -5498,7 +5497,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_4 @@ -5812,6 +5810,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -5823,7 +5822,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 @@ -6878,6 +6876,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6890,7 +6889,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 @@ -7070,6 +7068,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7081,7 +7080,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 @@ -8667,6 +8665,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; 
GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8679,7 +8678,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_4 @@ -8993,6 +8991,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9004,7 +9003,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index acbea3921b616..8ac6353133e72 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -429,6 +429,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 ; GFX942-NEXT: v_min_f32_e32 v6, v4, v9 ; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[4:5], 
v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -442,7 +443,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_4 @@ -549,6 +549,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 ; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -560,7 +561,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 @@ -1653,8 +1653,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop 
Header: Depth=1 @@ -1668,7 +1668,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ; implicit-def: $vgpr4 @@ -1784,8 +1783,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -1797,7 +1796,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 @@ -3605,6 +3603,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3618,7 +3617,6 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_4 @@ -3904,6 +3902,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3915,7 +3914,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 @@ -5486,6 +5484,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -5498,7 +5497,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_4 @@ -5812,6 +5810,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -5823,7 +5822,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 @@ -6878,6 +6876,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6890,7 +6889,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; 
GFX942-NEXT: s_cbranch_execnz .LBB18_4 @@ -7070,6 +7068,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7081,7 +7080,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 @@ -8667,6 +8665,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8679,7 +8678,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_4 @@ -8993,6 +8991,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec 
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9004,7 +9003,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 0199e2866b35d..3c991cfb7a1aa 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -258,59 +258,68 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[4:7], 0 offen -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v0, s[4:7], 0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], 0 offen offset:32 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v0, s[4:7], 0 offen offset:48 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v0, s[4:7], 0 offen offset:64 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v0, s[4:7], 0 offen offset:80 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v0, s[4:7], 0 offen offset:96 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v0, s[4:7], 0 offen offset:112 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v0, s[4:7], 0 offen offset:128 -; SDAG-GFX942-NEXT: 
buffer_load_dwordx4 v[38:41], v0, s[4:7], 0 offen offset:144 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v0, s[4:7], 0 offen offset:160 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v0, s[4:7], 0 offen offset:176 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v0, s[4:7], 0 offen offset:192 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 ; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 
v[40:43], v60, s[4:7], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224 +; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_nop 1 +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 
offen offset:80 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, 
s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm @@ -431,58 +440,46 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-GFX942-NEXT: v_add_u32_e32 v2, s0, v1 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v2, s[8:11], 0 offen -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v2, s[8:11], 0 offen offset:16 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v2, s[8:11], 0 offen offset:32 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v2, s[8:11], 0 offen offset:48 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v2, s[8:11], 0 offen offset:64 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v2, s[8:11], 0 offen offset:80 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v2, s[8:11], 0 offen offset:96 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v2, s[8:11], 0 offen offset:112 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v2, s[8:11], 0 offen offset:128 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v2, s[8:11], 0 offen offset:144 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v2, s[8:11], 0 offen offset:160 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v2, s[8:11], 0 offen offset:176 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v2, s[8:11], 0 offen offset:192 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v2, s[8:11], 0 offen offset:208 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v2, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v2, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: v_add_u32_e32 v2, s12, v1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], 
v62, s[8:11], 0 offen +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v2, s[4:7], 0 offen -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v2, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v2, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v2, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: 
buffer_store_dwordx4 v[20:23], v2, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v2, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v2, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v2, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v2, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v2, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v2, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v2, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v2, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v2, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v2, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v2, s[4:7], 0 offen offset:240 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: 
buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; GISEL-GFX942-NEXT: s_endpgm @@ -823,41 +820,30 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s8, v0 +; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0 ; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 ; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[12:15], 0 offen -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[12:15], 0 offen offset:16 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: 
buffer_store_dwordx4 v[10:13], v1, s[12:15], 0 offen offset:32 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[12:15], 0 offen offset:48 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[12:15], 0 offen offset:64 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[12:15], 0 offen offset:80 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[12:15], 0 offen offset:96 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[12:15], 0 offen offset:112 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[12:15], 0 offen offset:128 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[12:15], 0 offen offset:144 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[12:15], 0 offen offset:160 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[12:15], 0 offen offset:176 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[12:15], 0 offen offset:192 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[12:15], 0 offen offset:208 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[12:15], 0 offen offset:224 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse +; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen +; SDAG-GFX942-NEXT: buffer_store_dwordx4 
v[6:9], v62, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm @@ -991,43 +977,32 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s12, v0 +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0 ; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 ; 
GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[4:7], 0 
offen offset:224 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse +; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; GISEL-GFX942-NEXT: s_endpgm @@ -1171,8 +1146,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) 
%src, ptr addrspa ; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 ; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX942-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen +; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54 ; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; SDAG-GFX942-NEXT: s_mov_b32 s5, s12 @@ -1183,12 +1158,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 ; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; SDAG-GFX942-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v5, s0 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_endpgm ; ; SDAG-GFX1100-LABEL: memcpy_known_small: @@ -1242,8 +1217,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 ; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen +; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; 
GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54 ; GISEL-GFX942-NEXT: s_mov_b32 s4, s7 @@ -1254,12 +1229,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 ; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0 ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 ; GISEL-GFX942-NEXT: s_endpgm ; ; GISEL-GFX1100-LABEL: memcpy_known_small: diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index 8b998354b1f4f..683887b0a55f3 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -426,126 +426,122 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-LABEL: ds_write2_b32_av_av_no_vgprs: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a18, 
v63 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a0, v0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def a1 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def a34 +; GCN-NEXT: ; def a2 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_accvgpr_write_b32 a33, v31 -; GCN-NEXT: v_accvgpr_write_b32 a32, v30 -; GCN-NEXT: v_accvgpr_write_b32 a31, v29 -; GCN-NEXT: v_accvgpr_write_b32 a30, v28 -; GCN-NEXT: v_accvgpr_write_b32 a29, v27 -; GCN-NEXT: v_accvgpr_write_b32 a28, v26 -; GCN-NEXT: v_accvgpr_write_b32 a27, v25 -; GCN-NEXT: v_accvgpr_write_b32 a26, v24 -; GCN-NEXT: v_accvgpr_write_b32 a25, v23 -; GCN-NEXT: v_accvgpr_write_b32 a24, v22 -; GCN-NEXT: v_accvgpr_write_b32 a23, v21 -; GCN-NEXT: v_accvgpr_write_b32 a22, v20 -; GCN-NEXT: v_accvgpr_write_b32 a21, v19 -; GCN-NEXT: v_accvgpr_write_b32 a20, v18 -; GCN-NEXT: v_accvgpr_write_b32 a19, v17 -; GCN-NEXT: v_accvgpr_write_b32 a18, v16 -; GCN-NEXT: v_accvgpr_write_b32 a17, v15 -; GCN-NEXT: v_accvgpr_write_b32 a16, v14 -; GCN-NEXT: v_accvgpr_write_b32 a15, v13 -; GCN-NEXT: v_accvgpr_write_b32 a14, v12 -; GCN-NEXT: v_accvgpr_write_b32 a13, v11 -; GCN-NEXT: v_accvgpr_write_b32 a12, v10 -; GCN-NEXT: v_accvgpr_write_b32 a11, v9 -; GCN-NEXT: v_accvgpr_write_b32 a10, v8 -; GCN-NEXT: v_accvgpr_write_b32 a9, v7 -; GCN-NEXT: v_accvgpr_write_b32 a8, v6 -; GCN-NEXT: v_accvgpr_write_b32 a7, v5 -; GCN-NEXT: v_accvgpr_write_b32 a6, v4 -; GCN-NEXT: v_accvgpr_write_b32 a5, v3 -; GCN-NEXT: v_accvgpr_write_b32 a4, v2 -; GCN-NEXT: v_accvgpr_write_b32 a3, v1 -; GCN-NEXT: v_accvgpr_write_b32 a2, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a34 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 -; GCN-NEXT: v_accvgpr_read_b32 v0, a2 -; GCN-NEXT: v_accvgpr_read_b32 v1, a3 -; GCN-NEXT: v_accvgpr_read_b32 v2, a4 -; GCN-NEXT: v_accvgpr_read_b32 v3, a5 -; GCN-NEXT: v_accvgpr_read_b32 v4, a6 -; GCN-NEXT: v_accvgpr_read_b32 v5, a7 -; GCN-NEXT: v_accvgpr_read_b32 v6, a8 -; GCN-NEXT: v_accvgpr_read_b32 v7, a9 -; GCN-NEXT: v_accvgpr_read_b32 v8, a10 -; GCN-NEXT: v_accvgpr_read_b32 v9, a11 -; GCN-NEXT: v_accvgpr_read_b32 v10, a12 -; GCN-NEXT: v_accvgpr_read_b32 v11, a13 -; 
GCN-NEXT: v_accvgpr_read_b32 v12, a14 -; GCN-NEXT: v_accvgpr_read_b32 v13, a15 -; GCN-NEXT: v_accvgpr_read_b32 v14, a16 -; GCN-NEXT: v_accvgpr_read_b32 v15, a17 -; GCN-NEXT: v_accvgpr_read_b32 v16, a18 -; GCN-NEXT: v_accvgpr_read_b32 v17, a19 -; GCN-NEXT: v_accvgpr_read_b32 v18, a20 -; GCN-NEXT: v_accvgpr_read_b32 v19, a21 -; GCN-NEXT: v_accvgpr_read_b32 v20, a22 -; GCN-NEXT: v_accvgpr_read_b32 v21, a23 -; GCN-NEXT: v_accvgpr_read_b32 v22, a24 -; GCN-NEXT: v_accvgpr_read_b32 v23, a25 -; GCN-NEXT: v_accvgpr_read_b32 v24, a26 -; GCN-NEXT: v_accvgpr_read_b32 v25, a27 -; GCN-NEXT: v_accvgpr_read_b32 v26, a28 -; GCN-NEXT: v_accvgpr_read_b32 v27, a29 -; GCN-NEXT: v_accvgpr_read_b32 v28, a30 -; GCN-NEXT: v_accvgpr_read_b32 v29, a31 -; GCN-NEXT: v_accvgpr_read_b32 v30, a32 -; GCN-NEXT: v_accvgpr_read_b32 v31, a33 +; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload 
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword a34, off, s[0:3], s32 
; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload 
Reuse +; GCN-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10 %gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24 @@ -980,133 +976,123 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword a37, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_accvgpr_write_b32 a6, v41 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a7, v42 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a8, v43 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a9, v44 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a10, v45 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a11, v46 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a12, v47 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a13, v56 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a14, v57 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a15, v58 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a16, v59 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a17, v60 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a18, v61 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a19, v62 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a20, v63 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a0, v0 ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def a[34:35] +; GCN-NEXT: ; def a[2:3] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def a[36:37] +; GCN-NEXT: ; def a[4:5] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_accvgpr_write_b32 a33, v31 -; GCN-NEXT: v_accvgpr_write_b32 a32, v30 -; 
GCN-NEXT: v_accvgpr_write_b32 a31, v29 -; GCN-NEXT: v_accvgpr_write_b32 a30, v28 -; GCN-NEXT: v_accvgpr_write_b32 a29, v27 -; GCN-NEXT: v_accvgpr_write_b32 a28, v26 -; GCN-NEXT: v_accvgpr_write_b32 a27, v25 -; GCN-NEXT: v_accvgpr_write_b32 a26, v24 -; GCN-NEXT: v_accvgpr_write_b32 a25, v23 -; GCN-NEXT: v_accvgpr_write_b32 a24, v22 -; GCN-NEXT: v_accvgpr_write_b32 a23, v21 -; GCN-NEXT: v_accvgpr_write_b32 a22, v20 -; GCN-NEXT: v_accvgpr_write_b32 a21, v19 -; GCN-NEXT: v_accvgpr_write_b32 a20, v18 -; GCN-NEXT: v_accvgpr_write_b32 a19, v17 -; GCN-NEXT: v_accvgpr_write_b32 a18, v16 -; GCN-NEXT: v_accvgpr_write_b32 a17, v15 -; GCN-NEXT: v_accvgpr_write_b32 a16, v14 -; GCN-NEXT: v_accvgpr_write_b32 a15, v13 -; GCN-NEXT: v_accvgpr_write_b32 a14, v12 -; GCN-NEXT: v_accvgpr_write_b32 a13, v11 -; GCN-NEXT: v_accvgpr_write_b32 a12, v10 -; GCN-NEXT: v_accvgpr_write_b32 a11, v9 -; GCN-NEXT: v_accvgpr_write_b32 a10, v8 -; GCN-NEXT: v_accvgpr_write_b32 a9, v7 -; GCN-NEXT: v_accvgpr_write_b32 a8, v6 -; GCN-NEXT: v_accvgpr_write_b32 a7, v5 -; GCN-NEXT: v_accvgpr_write_b32 a6, v4 -; GCN-NEXT: v_accvgpr_write_b32 a5, v3 -; GCN-NEXT: v_accvgpr_write_b32 a4, v2 -; GCN-NEXT: v_accvgpr_write_b32 a3, v1 -; GCN-NEXT: v_accvgpr_write_b32 a2, v0 -; GCN-NEXT: v_accvgpr_read_b32 v2, a34 -; GCN-NEXT: v_accvgpr_read_b32 v4, a36 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte 
Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_accvgpr_write_b32 a21, v31 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v3, a35 -; GCN-NEXT: v_accvgpr_read_b32 v5, a37 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 ; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 -; GCN-NEXT: v_accvgpr_read_b32 v0, a2 -; GCN-NEXT: v_accvgpr_read_b32 v1, a3 -; GCN-NEXT: v_accvgpr_read_b32 v2, a4 -; GCN-NEXT: v_accvgpr_read_b32 v3, a5 -; GCN-NEXT: v_accvgpr_read_b32 v4, a6 -; GCN-NEXT: v_accvgpr_read_b32 v5, a7 -; GCN-NEXT: v_accvgpr_read_b32 v6, a8 -; GCN-NEXT: v_accvgpr_read_b32 v7, a9 -; GCN-NEXT: v_accvgpr_read_b32 v8, a10 -; GCN-NEXT: v_accvgpr_read_b32 v9, a11 -; GCN-NEXT: v_accvgpr_read_b32 v10, a12 -; GCN-NEXT: v_accvgpr_read_b32 v11, 
a13 -; GCN-NEXT: v_accvgpr_read_b32 v12, a14 -; GCN-NEXT: v_accvgpr_read_b32 v13, a15 -; GCN-NEXT: v_accvgpr_read_b32 v14, a16 -; GCN-NEXT: v_accvgpr_read_b32 v15, a17 -; GCN-NEXT: v_accvgpr_read_b32 v16, a18 -; GCN-NEXT: v_accvgpr_read_b32 v17, a19 -; GCN-NEXT: v_accvgpr_read_b32 v18, a20 -; GCN-NEXT: v_accvgpr_read_b32 v19, a21 -; GCN-NEXT: v_accvgpr_read_b32 v20, a22 -; GCN-NEXT: v_accvgpr_read_b32 v21, a23 -; GCN-NEXT: v_accvgpr_read_b32 v22, a24 -; GCN-NEXT: v_accvgpr_read_b32 v23, a25 -; GCN-NEXT: v_accvgpr_read_b32 v24, a26 -; GCN-NEXT: v_accvgpr_read_b32 v25, a27 -; GCN-NEXT: v_accvgpr_read_b32 v26, a28 -; GCN-NEXT: v_accvgpr_read_b32 v27, a29 -; GCN-NEXT: v_accvgpr_read_b32 v28, a30 -; GCN-NEXT: v_accvgpr_read_b32 v29, a31 -; GCN-NEXT: v_accvgpr_read_b32 v30, a32 -; GCN-NEXT: v_accvgpr_read_b32 v31, a33 +; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a28, v24 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a27, v25 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a26, v26 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a25, v27 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: v_accvgpr_read_b32 v21, a31 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v22, a30 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v23, a29 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v24, a28 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v25, a27 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v26, a26 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v27, a25 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v28, a24 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v29, a23 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v30, a22 ; Reload Reuse +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_accvgpr_read_b32 v31, a21 ; Reload Reuse ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword a37, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword a36, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: v_accvgpr_read_b32 v63, a20 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v62, a19 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v61, a18 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v60, a17 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v59, a16 ; Reload Reuse +; GCN-NEXT: 
v_accvgpr_read_b32 v58, a15 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v57, a14 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v56, a13 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v47, a12 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v46, a11 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v45, a10 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v44, a9 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v43, a8 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v42, a7 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v41, a6 ; Reload Reuse ; GCN-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %lds, i32 0, i32 10 %gep.1 = getelementptr inbounds [512 x i64], ptr addrspace(3) %lds, i32 0, i32 24 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index af817c3ee4eb1..1e7855ccb3642 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -1012,6 +1012,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB12_2 ; GFX950-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private @@ -1044,6 +1045,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX950-GISEL-NEXT: .LBB12_4: ; 
%atomicrmw.private @@ -1167,6 +1169,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private @@ -1203,6 +1206,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll index d973f7b71fb6d..57bfd2490f9da 100644 --- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll @@ -19,11 +19,11 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; GFX90A-LABEL: half8: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx4 v[2:5], v0, s[0:1] +; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX1030-LABEL: half8: @@ -85,11 +85,11 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; GFX90A-LABEL: half6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx3 v[2:4], v0, s[0:1] +; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx3 v0, v[2:4], s[2:3] +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX1030-LABEL: half6: diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll index 554d4f69ea4a2..597f90c0f4e84 100644 --- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll @@ -43,7 +43,8 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { } ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal VGPR to SGPR copy -; GCN: ; illegal copy a1 to s9 +; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1 +; GCN: ; illegal copy [[COPY1]] to s9 define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { %agpr = call i32 asm sideeffect "; def $0", "=${a1}"() call void asm sideeffect "; use $0", "${s9}"(i32 %agpr) @@ -51,7 +52,9 @@ define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { } ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal VGPR to SGPR copy -; GCN: ; illegal copy a[0:1] to s[10:11] +; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0 +; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1 +; GCN: ; illegal copy v[[[COPY1L]]:[[COPY1H]]] to s[10:11] define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 { %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"() call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll index 364d2f52777d3..b91963f08681c 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll +++ 
b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -49,10 +49,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_barrier -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ds_read_b32 v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: barrier_release: @@ -72,10 +72,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_barrier ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-LABEL: barrier_release: @@ -94,10 +94,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_barrier -; GFX942-NEXT: ds_read_b32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: ds_read_b32 v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NEXT: global_store_dword v1, v0, s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: barrier_release: @@ -117,10 +117,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_barrier ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 -; 
GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX10WGP-LABEL: barrier_release: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll index a57b43a81205b..3e96dfe40f745 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll @@ -37,11 +37,11 @@ entry: define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX950-SDAG-LABEL: ds_read_b96_tr_b6: ; GFX950-SDAG: ; %bb.0: ; %entry -; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[4:6], v0 offset:32 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: global_store_dwordx3 v[2:3], v[4:6], off +; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: ds_read_b96_tr_b6: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index fb32a83f3cf3c..7959cee49b93f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -294,17 +294,17 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b32 v2, v1 +; GCN-NEXT: ds_read_b32 v1, v1 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; 
GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_write_b32 v0, v2 +; GCN-NEXT: ds_write_b32 v0, v1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ds_read_b32 v1, v1 offset:256 -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_read_b32 v0, v2 offset:256 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_write_b32 v0, v1 offset:256 +; GCN-NEXT: ds_write_b32 v1, v0 offset:256 ; GCN-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll index efd5df85280e6..49607e320bd0a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll @@ -39,7 +39,9 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -68,10 +70,14 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[2:3] +; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf 
unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() @@ -86,7 +92,8 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -99,7 +106,8 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -115,7 +123,9 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 % ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -129,7 +139,9 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; 
GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc ; GFX90A-NEXT: s_endpgm %data = call i64 asm "; def $0", "=a"() %unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -142,10 +154,14 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[2:3] +; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll index 92a5f88246888..12a998ad82cd2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll @@ -89,59 +89,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, 
a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: 
v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm @@ -255,25 +255,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 9 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a4 ; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; GFX908-NEXT: 
global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; GFX908-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX908-NEXT: s_endpgm @@ -422,22 +422,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a4 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a0 ; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], 
s[16:17] offset:48 ; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll index c21d86684e445..87a7c2ef6c95c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll @@ -8,10 +8,10 @@ define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg ; HEURRC-LABEL: default: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 @@ -34,10 +34,10 @@ define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> ; HEURRC-LABEL: request_agpr: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 22bc62acce15d..5ab8706f28f5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -726,12 +726,12 @@ define amdgpu_kernel void 
@test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0 +; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0 ; GFX90A-VGPR-NEXT: s_nop 3 -; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-VGPR-NEXT: s_nop 7 ; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -742,12 +742,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0 +; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -765,10 +765,10 @@ 
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s11 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s11 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 @@ -779,7 +779,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 @@ -792,10 +792,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v1, s11 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 @@ -806,7 +806,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; 
GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 @@ -819,17 +819,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s10 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s10 ; GFX90A-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s11 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s11 +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 @@ -842,17 +842,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s10 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s10 ; GFX942-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v9, s11 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s11 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[12:13] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 @@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; 
GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] 
op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index bc4822ef32a3d..dc4c929124fec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -1445,20 +1445,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] +; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1467,38 +1467,38 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: 
v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] +; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1507,18 +1507,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16: ; GFX942-AGPRCD: ; %bb.0: ; %bb @@ -1577,11 +1577,11 @@ define amdgpu_kernel void 
@test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -1606,11 +1606,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s24 +; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1621,7 +1621,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -1635,11 +1635,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1650,7 +1650,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], 
v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -1664,11 +1664,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s24 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1679,7 +1679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -1847,20 +1847,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-SDAG-NEXT: v_mov_b32_e32 
v0, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] +; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1869,38 +1869,38 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] +; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] ; GFX950-SDAG-NEXT: s_endpgm ; ; 
GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1909,18 +1909,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16: ; GFX942-AGPRCD: ; %bb.0: ; %bb @@ -1979,11 +1979,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-SDAG-NEXT: v_mov_b64_e32 
v[20:21], s[22:23] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -2008,11 +2008,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s24 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -2023,7 +2023,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 
v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -2037,11 +2037,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -2052,7 +2052,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -2066,11 +2066,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-GISEL-NEXT: 
s_load_dword s24, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s24 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -2081,7 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -2275,21 +2275,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: @@ -2322,21 +2322,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: 
v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -2495,15 +2495,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2512,7 +2512,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, 
v[0:3], s[24:25] @@ -2560,15 +2560,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2577,7 +2577,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -2789,21 +2789,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], 
s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: @@ -2836,21 +2836,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -3000,21 +3000,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: 
v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: @@ -3047,21 +3047,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -3211,21 +3211,21 @@ define amdgpu_kernel void 
@test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: @@ -3258,21 +3258,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -3422,21 +3422,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: @@ -3469,21 +3469,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: 
global_store_dwordx4 v0, v[6:9], s[12:13] +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -3642,15 +3642,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3659,7 +3659,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -3707,15 +3707,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: 
s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3724,7 +3724,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -3945,15 +3945,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: 
s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3962,7 +3962,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4010,15 +4010,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] 
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4027,7 +4027,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4248,15 +4248,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4265,7 +4265,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; 
GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4313,15 +4313,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4330,7 +4330,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; 
GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4551,15 +4551,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4568,7 +4568,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] @@ -4616,15 +4616,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c ; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], 
s[16:17] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4633,7 +4633,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 68e3afe8b449a..033a35f69a0bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -15,15 +15,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[0:1], 48 -; GCN-NEXT: v_mov_b64_e32 v[2:3], 32 -; GCN-NEXT: v_mov_b64_e32 v[4:5], 16 +; GCN-NEXT: v_mov_b64_e32 v[8:9], 48 +; 
GCN-NEXT: v_mov_b64_e32 v[10:11], 32 +; GCN-NEXT: v_mov_b64_e32 v[12:13], 16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -41,39 +41,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: v_mov_b32_e32 v18, s18 ; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: v_mov_b32_e32 v8, s20 -; GCN-NEXT: v_mov_b32_e32 v9, s21 -; GCN-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NEXT: v_mov_b64_e32 v[6:7], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NEXT: v_mov_b64_e32 v[14:15], 0 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 +; GCN-NEXT: 
global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -87,15 +88,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[0:1], 48 -; GCN-NEXT: v_mov_b64_e32 v[2:3], 32 -; GCN-NEXT: v_mov_b64_e32 v[4:5], 16 +; GCN-NEXT: v_mov_b64_e32 v[8:9], 48 +; GCN-NEXT: v_mov_b64_e32 v[10:11], 32 +; GCN-NEXT: v_mov_b64_e32 v[12:13], 16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: 
v_mov_b64_e32 v[14:15], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -113,39 +114,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v18, s18 ; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: v_mov_b32_e32 v8, s20 -; GCN-NEXT: v_mov_b32_e32 v9, s21 -; GCN-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NEXT: v_mov_b64_e32 v[6:7], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NEXT: v_mov_b64_e32 v[14:15], 0 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1) @@ -158,22 +160,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 ; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, 
v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 11 @@ -202,22 +204,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 ; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], 
a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 11 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 03bf33e0d17e6..753206206180a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -12,45 +12,29 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, ; -------------------------------------------------------------------- define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_f32_16x16x32_f16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_f32_16x16x32_f16: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_f32_16x16x32_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; 
GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 @@ -90,45 +74,29 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg } define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_f32_16x16x32_f16__flags: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_f32_16x16x32_f16__flags: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 
a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 7 @@ -414,15 +382,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 +; SDAG-NEXT: 
v_mov_b64_e32 v[12:13], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -440,39 +408,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 ; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 +; SDAG-NEXT: 
global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -480,15 +449,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 +; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; 
GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -504,33 +473,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 
0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -538,15 +508,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48 -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32 -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -564,39 +534,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: v_mov_b32_e32 v16, s16 ; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] ; 
HEURRC-NEXT: v_mov_b32_e32 v18, s18 ; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: v_mov_b32_e32 v8, s20 -; HEURRC-NEXT: v_mov_b32_e32 v9, s21 -; HEURRC-NEXT: v_mov_b32_e32 v10, s22 -; HEURRC-NEXT: v_mov_b32_e32 v11, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0 ; HEURRC-NEXT: s_nop 4 -; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s8 ; HEURRC-NEXT: v_mov_b32_e32 v1, s9 ; HEURRC-NEXT: v_mov_b32_e32 v2, s10 ; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s12 ; HEURRC-NEXT: v_mov_b32_e32 v1, s13 ; HEURRC-NEXT: v_mov_b32_e32 v2, s14 ; HEURRC-NEXT: v_mov_b32_e32 
v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -604,15 +575,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48 -; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32 -; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -622,40 +593,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v48, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v49, s17 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15] +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 ; VGPRRC-NEXT: s_nop 8 -; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 
; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 -; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16: @@ -794,15 +765,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 
0x64 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -820,39 +791,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 ; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) 
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -860,15 +832,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 +; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 
s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -884,33 +856,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1 +; 
GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -918,15 +891,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48 -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32 -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -944,39 +917,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> 
%arg0, < ; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: v_mov_b32_e32 v16, s16 ; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; HEURRC-NEXT: v_mov_b32_e32 v18, s18 ; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: v_mov_b32_e32 v8, s20 -; HEURRC-NEXT: v_mov_b32_e32 v9, s21 -; HEURRC-NEXT: v_mov_b32_e32 v10, s22 -; HEURRC-NEXT: v_mov_b32_e32 v11, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0 ; HEURRC-NEXT: s_nop 4 -; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s8 ; HEURRC-NEXT: v_mov_b32_e32 v1, s9 ; HEURRC-NEXT: v_mov_b32_e32 v2, s10 ; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; 
HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s12 ; HEURRC-NEXT: v_mov_b32_e32 v1, s13 ; HEURRC-NEXT: v_mov_b32_e32 v2, s14 ; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -984,15 +958,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48 -; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32 -; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -1002,40 +976,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v48, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v49, s17 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], 
v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 ; VGPRRC-NEXT: s_nop 8 -; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 -; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: 
global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags: @@ -1170,105 +1144,65 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < } define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: 
test_mfma_f32_32x32x16_f16__mac: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: 
v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: 
v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; HEURRC-NEXT: s_nop 11 @@ -1380,105 +1314,65 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half } define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: 
v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 
v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: ; HEURRC: ; 
%bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 11 @@ -2642,45 +2536,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg) define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) { -; SDAG-LABEL: test_mfma_i32_16x16x64_i8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_i32_16x16x64_i8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_i32_16x16x64_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; 
HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 @@ -2720,45 +2598,29 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 } define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) { -; SDAG-LABEL: test_mfma_i32_16x16x64_i8__flags: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_i32_16x16x64_i8__flags: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_i32_16x16x64_i8 
a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 7 @@ -3173,15 +3035,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 +; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -3197,33 +3059,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: 
v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: 
s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -3584,15 +3447,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 +; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -3608,33 +3471,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 
v[18:19], 48 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -3920,105 +3784,65 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 } define <16 x i32> 
@test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) { -; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: 
v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: 
v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; 
HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; HEURRC-NEXT: s_nop 11 @@ -4130,105 +3954,65 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar } define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) { -; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; 
SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; 
GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 -; HEURRC-NEXT: 
v_accvgpr_write_b32 a10, v18 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 11 @@ -5515,10 +5299,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; GCN-LABEL: test_mfma_f32_16x16x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 ; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] ; GCN-NEXT: s_nop 7 @@ -5531,10 +5315,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; HEURRC-LABEL: 
test_mfma_f32_16x16x32_bf16: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] ; HEURRC-NEXT: s_nop 7 @@ -5577,10 +5361,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 ; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 @@ -5593,10 +5377,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 ; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 ; HEURRC-NEXT: s_nop 7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll index c1946630ef5f1..d24f1f0b526c3 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll @@ -52,26 +52,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a3 -; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 -; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 -; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: 
global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_32x32x8i8: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 3d9ebf91e8f47..7e30af96bb8b9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -99,59 +99,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; NOLIT-SRCC-NEXT: s_nop 0 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm @@ -234,59 +234,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 
; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm @@ -510,25 +510,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm @@ 
-577,25 +577,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm @@ -864,22 +864,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; 
NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -931,22 +931,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -1257,59 +1257,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm @@ -1396,59 +1396,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 
v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 ; 
LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 ; LIT-SRCC-NEXT: s_endpgm @@ -1690,25 +1690,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 
v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm @@ -1760,25 +1760,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] 
offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm @@ -2080,22 +2080,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -2150,22 +2150,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; 
LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -2425,7 +2425,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -2482,7 +2482,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 @@ -2491,7 +2491,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 @@ -2500,67 +2500,53 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; NOLIT-SRCC-NEXT: 
s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 
v16, v[12:15], s[34:35] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -2617,7 +2603,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 -; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 @@ -2626,7 +2612,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 @@ -2635,61 +2621,47 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], 
s[34:35] offset:96 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v15, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 +; 
LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_32x32x4i8: @@ -2871,134 +2843,134 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 
v13, s12 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: 
global_store_dwordx4 v12, v[0:3], s[16:17] +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 -; LIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 -; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 -; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; LIT-SRCC-NEXT: 
v_accvgpr_write_b32 a5, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 -; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 -; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 -; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 -; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 -; 
LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_16x16x4i8: @@ -3123,37 +3095,30 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, 
a15 -; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, 
v[4:7], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64: @@ -3161,33 +3126,30 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 -; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64: @@ -3632,59 +3594,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm @@ -3768,59 +3730,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; 
LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, 
a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm @@ -4049,22 +4011,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: 
global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -4116,22 +4078,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 @@ -4478,32 +4440,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: 
s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -4516,28 +4478,28 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0 ; LIT-SRCC-NEXT: 
v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm @@ -4622,32 +4584,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -4659,31 +4621,33 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, 0x40004000 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, v2 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a3 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[8:11], s[0:1] offset:16 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, 
a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[9:12], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x8f16_imm_splat: @@ -4787,60 +4751,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -4850,55 +4814,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:112 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:96 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:80 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:112 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:96 +; LIT-SRCC-NEXT: global_store_dwordx4 
v14, v[8:11], s[0:1] offset:80 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a18 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:64 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:64 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, 
a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm_splat: @@ -5091,32 +5055,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -5145,32 +5109,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: s_nop 9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm @@ -5313,60 +5277,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; 
NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, 
a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm @@ -5412,60 +5376,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm @@ -5916,40 +5880,40 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 ; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 @@ -6011,40 +5975,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 -; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v10, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 52dcfb735a899..aae14c8cc87b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -14,37 +14,21 @@ ; fp8 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; 
GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -53,37 +37,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -92,37 +60,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -131,37 +83,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x 
i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: 
v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -170,37 +106,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -209,37 +129,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; 
GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -248,37 +152,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; 
SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -287,37 +175,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -327,37 +199,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: 
v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -367,37 +223,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; fp8 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, 
a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp @@ -407,37 +247,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp @@ -447,37 +271,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; fp8 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; 
SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -487,37 +295,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; 
This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, 
a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -527,37 +319,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; fp8 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp @@ -567,37 +343,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 
a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 3, ; blgp @@ -607,37 +367,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; fp8 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 
v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -647,37 +391,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -687,37 +415,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; bf8 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: 
v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp @@ -727,37 +439,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 -; GISEL-NEXT: s_nop 
11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 0, ; blgp @@ -767,37 +463,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; bf8 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp @@ -808,37 +488,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 1, ; blgp @@ -848,37 +512,21 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons ; bf8 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 
v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp @@ -887,37 +535,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 
s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 2, ; blgp @@ -927,37 +559,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; bf8 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; 
GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp @@ -967,37 +583,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], 
a[0:3] cbsz:1 blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 3, ; blgp @@ -1007,37 +607,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; bf8 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; 
SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp @@ -1047,37 +631,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: 
v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz i32 4, ; blgp @@ -1087,37 +655,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; fp6 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 
v18, v19 op_sel_hi:[0,0,0] cbsz:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp @@ -1127,37 +679,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; 
SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 0, ; blgp @@ -1167,37 +703,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; fp6 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, 
v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp @@ -1207,37 +727,21 @@ define <4 x 
float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], 
v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 1, ; blgp @@ -1247,37 +751,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; fp6 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: 
v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -1287,37 +775,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 2, ; blgp @@ -1327,37 +799,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons ; fp6 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], 
v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp @@ -1367,37 +823,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x ; This should be optimized to avoid the scale define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; 
GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 3, ; blgp @@ -1408,37 +848,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons ; bf6 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp @@ -1448,37 +872,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 
a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 0, ; blgp @@ -1488,37 +896,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons ; bf6 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: 
v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp @@ -1528,37 +920,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 1, ; blgp @@ -1568,37 +944,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; bf6 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; 
GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp @@ -1608,37 +968,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 
blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 2, ; blgp @@ -1648,37 +992,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons ; bf6 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp @@ -1688,37 +1016,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; 
SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 4, ; blgp @@ -1728,37 +1040,21 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons ; bf6 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 
v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp @@ -1768,37 +1064,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: 
v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz i32 3, ; blgp @@ -1808,37 +1088,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons ; fp6 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 
a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp @@ -1848,37 +1112,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: 
v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz i32 4, ; blgp @@ -1888,37 +1136,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons ; fp4 x fp8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, 
<4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -1928,37 +1160,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -1968,37 +1184,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; fp4 x bf8 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], 
v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp @@ -2008,37 +1208,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; 
SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 1, ; blgp @@ -2048,77 +1232,45 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; fp4 x fp6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: 
v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, - i32 4, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; 
GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 2, ; blgp @@ -2128,37 +1280,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons ; fp4 x bf6 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp @@ -2168,37 +1304,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; 
SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 3, ; blgp @@ -2208,37 +1328,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons ; fp4 x fp4 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 
x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -2248,37 +1352,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; 
GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -2291,17 +1379,97 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons ; -------------------------------------------------------------------- define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x 
float> %arg2, i32 inreg %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 
%scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, s0 -; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s24 +; SDAG-NEXT: v_mov_b32_e32 v11, s25 +; SDAG-NEXT: v_mov_b32_e32 v12, s26 +; SDAG-NEXT: v_mov_b32_e32 v13, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v4 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v5 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2309,17 +1477,29 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, s0 -; GISEL-NEXT: v_mov_b32_e32 v17, s1 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b32_e32 v20, s28 +; GISEL-NEXT: v_mov_b32_e32 v21, s29 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2330,148 +1510,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ret <4 x float> %result } -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, s0 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, s0 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: 
v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, s0 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, s0 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v12, s0 -; SDAG-NEXT: v_mov_b32_e32 v13, s1 -; SDAG-NEXT: v_mov_b32_e32 v14, s2 -; SDAG-NEXT: v_mov_b32_e32 v15, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 -; 
SDAG-NEXT: v_accvgpr_write_b32 a2, v0 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_mov_b32_e32 v8, s24 -; SDAG-NEXT: v_mov_b32_e32 v9, s25 -; SDAG-NEXT: v_mov_b32_e32 v10, s26 -; SDAG-NEXT: v_mov_b32_e32 v11, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s28 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s29 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 -; GISEL-NEXT: v_accvgpr_write_b32 a0, s28 -; GISEL-NEXT: v_accvgpr_write_b32 a1, s29 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v14, s0 @@ -2482,10 +1522,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] @@ -2536,10 +1576,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 ; 
SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] @@ -2582,10 +1622,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v14, s0 ; SDAG-NEXT: v_mov_b32_e32 v15, s1 ; SDAG-NEXT: v_mov_b32_e32 v16, s2 @@ -2594,6 +1630,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] @@ -2612,13 +1652,13 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] @@ -2711,14 +1751,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, -2 +; SDAG-NEXT: v_mov_b32_e32 v21, 33 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, -2 -; SDAG-NEXT: v_mov_b32_e32 v17, 33 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2751,14 +1791,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, -2 +; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, -2 -; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2791,14 +1831,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d -; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3145,328 +2185,58 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: 
v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result } ; This should be optimized to avoid the scale, with non-0 op_sel arguments. 
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, 1 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v17, 1 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: 
v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 1 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 1 -; GISEL-NEXT: v_mov_b32_e32 v17, 0 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) - ret <4 x float> %result -} - -; -------------------------------------------------------------------- -; Incorrect signature for format cases (IR vector too large) -; -------------------------------------------------------------------- - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: 
v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 0, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 -; SDAG-NEXT: s_nop 11 -; 
SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 2, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <4 x float> %result -} - -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: 
v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp - i32 0, i32 0, i32 0, i32 0) +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) ret <4 x float> %result } -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: 
v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, 1 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3474,38 +2244,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mov_b32_e32 v17, 1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp - i32 0, i32 %scale0, i32 
0, i32 %scale1) + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <4 x float> %result } -define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 1 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3513,21 +2284,162 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: 
v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 1 +; GISEL-NEXT: v_mov_b32_e32 v17, 0 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 ; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + ret <4 x float> %result +} + +; -------------------------------------------------------------------- +; Incorrect signature for format cases (IR vector too large) +; -------------------------------------------------------------------- + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz 
+ i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x 
float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -3536,37 +2448,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 
op_sel_hi:[0,0,0] blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz i32 4, ; blgp @@ -3575,37 +2471,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 
%scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 
v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 0, ; blgp @@ -3614,37 +2494,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp @@ -3653,37 +2517,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], 
v[8:15], a[0:3] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 7b7865e3434db..f0205a3a788ed 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -17,27 +17,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: 
v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -61,11 +61,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; 
GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -81,7 +81,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -112,27 +112,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: 
v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -156,11 +156,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -176,7 +176,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 
op_sel:[1,1,0] op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -207,27 +207,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: 
s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -251,11 +251,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -271,7 +271,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -302,27 +302,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; 
SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -346,11 +346,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -366,7 +366,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -397,27 +397,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: 
v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -441,11 +441,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -461,7 +461,7 @@ 
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -492,27 +492,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; 
SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -536,11 +536,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -556,7 +556,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -587,27 +587,27 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; 
SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -631,11 +631,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -651,7 +651,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -682,27 +682,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, 
v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -726,11 +726,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, 
v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -746,7 +746,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -775,89 +775,47 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; This should be optimized to avoid the scale define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_waitcnt 
vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 
v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: 
v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 0, ; blgp @@ -870,27 +828,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -914,11 +872,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -934,7 +892,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -962,89 +920,47 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x } define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: 
v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 1, ; blgp @@ -1054,91 +970,48 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; fp8 x fp6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, 
v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; 
GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; 
GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -1147,87 +1020,46 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: 
v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: 
v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz i32 2, ; blgp @@ -1237,29 +1069,226 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons ; fp8 x bf6 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 
a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, 
v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp8 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: 
v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: 
v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; 
SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1280,29 +1309,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: 
v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1322,307 +1352,88 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 3, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 
v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 3, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp8 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; 
SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp - i32 0, i32 
%scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: -; GISEL: ; 
%bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x 
i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; bf8 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; 
SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1643,14 +1454,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -1666,7 +1477,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1688,3087 +1499,1992 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz - i32 0, ; blgp + i32 1, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: 
v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x 
float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz - i32 0, ; blgp + i32 1, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x bf8 -define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; 
GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> 
%arg1, <16 x float> %arg2, +; bf8 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz - i32 1, ; blgp + i32 2, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: 
s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 
v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x 
i32> %arg1, <16 x float> %arg2, +; bf8 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz - i32 1, ; blgp - i32 0, i32 0, i32 0, i32 0) + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -; bf8 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; 
GISEL-NEXT: s_setpc_b64 s[30:31] +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz - i32 2, ; blgp + 
i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> 
%arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; 
GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz - i32 2, ; blgp + i32 4, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> 
%arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: 
v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 3, ; blgp +; fp6 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) 
{ +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: 
v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 3, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf8 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: -; SDAG: ; %bb.0: -; 
SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: 
v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 4, ; blgp +; fp6 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: 
v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, 
v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 1, ; cbsz - i32 4, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; 
GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; 
SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; 
GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +; fp6 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 
a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz - i32 0, ; blgp + i32 2, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, 
v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; 
GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: 
v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz - i32 0, ; blgp + i32 2, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: 
v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; 
GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +; fp6 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz - i32 1, ; blgp + i32 3, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; 
SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; 
GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz - i32 1, ; blgp + i32 3, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; 
SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 
v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp + +; bf6 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: 
v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 
v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: 
v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: 
v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; 
GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 3, ; blgp +; bf6 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; 
GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, 
a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x 
float> %arg2, - i32 2, ; cbsz - i32 3, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + 
i32 1, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } - -; bf6 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: 
s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +; bf6 x fp6 +define 
<16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz - i32 0, ; blgp + i32 2, ; blgp i32 0, i32 %scale0, i32 0, 
i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz - i32 0, ; blgp + i32 2, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf6 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: -; 
SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; 
GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +; bf6 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: 
v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz - i32 1, ; blgp + i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; 
SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; 
GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz - i32 1, ; blgp + i32 4, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf6 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: 
v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: 
v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; bf6 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 
a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz - i32 2, ; blgp + i32 3, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, 
v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; 
GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz - i32 2, ; blgp + i32 3, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf6 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: 
v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: 
v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; fp6 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: 
v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz + i32 2, ; cbsz i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, 
a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz + i32 2, ; cbsz i32 4, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; bf6 x 
bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: 
v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 3, ; blgp +; fp4 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; 
GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 3, ; cbsz - i32 3, ; blgp +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 
a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp6 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 -; 
SDAG-NEXT: v_accvgpr_write_b32 a12, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 -; GISEL-NEXT: 
v_accvgpr_write_b32 a10, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 4, ; blgp +; fp4 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: 
v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 -; 
SDAG-NEXT: v_accvgpr_write_b32 a8, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 -; GISEL-NEXT: 
v_accvgpr_write_b32 a14, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 4, ; blgp - i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -; fp4 x fp8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: 
v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] 
cbsz:4 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: 
v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: 
v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +; fp4 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; 
GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz - i32 0, ; blgp + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; 
GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp4 x bf8 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, 
a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: 
v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +; fp4 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: 
v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz - i32 1, ; blgp + i32 3, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: 
v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] 
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz - i32 1, ; blgp + i32 3, ; blgp i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -; fp4 x fp6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: 
v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, +; fp4 x fp4 +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz - i32 2, ; blgp + i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret 
<16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, 
a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; -------------------------------------------------------------------- +; Different input parameter classes +; -------------------------------------------------------------------- + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, 
a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; 
GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 
a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: 
v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s20 +; SDAG-NEXT: v_mov_b32_e32 v19, s21 +; SDAG-NEXT: v_mov_b32_e32 v20, s22 +; SDAG-NEXT: v_mov_b32_e32 v21, s23 +; SDAG-NEXT: v_mov_b32_e32 v22, s24 +; SDAG-NEXT: v_mov_b32_e32 v23, s25 +; SDAG-NEXT: v_mov_b32_e32 v24, s26 +; SDAG-NEXT: v_mov_b32_e32 v25, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v3 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v4 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v5 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v6 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v7 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 -; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4787,28 +3503,43 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; 
GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v32, s28 +; GISEL-NEXT: v_mov_b32_e32 v33, s29 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v33 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v2 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v3 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v4 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v5 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v6 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v7 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 +; GISEL-NEXT: 
v_accvgpr_write_b32 a13, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4826,37 +3557,43 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 2, ; blgp - i32 0, i32 0, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -; fp4 x bf6 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 -; SDAG-NEXT: 
v_accvgpr_write_b32 a10, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4875,115 +3612,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x ; 
SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x 
float> %arg2, - i32 4, ; cbsz - i32 3, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v10 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v25 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 
blgp:3 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5001,37 +3661,43 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 3, ; blgp - i32 0, i32 0, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -; fp4 x fp4 -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5050,9 +3716,17 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 @@ -5069,9 +3743,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5089,36 +3765,43 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 4, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { -; 
SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: 
v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5137,9 +3820,17 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 @@ -5156,9 +3847,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: 
v_accvgpr_read_b32 v2, a2 @@ -5176,55 +3869,109 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 4, ; blgp - i32 0, i32 0, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -; -------------------------------------------------------------------- -; Different input parameter classes -; -------------------------------------------------------------------- +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_accvgpr_write_b32 a4, s16 +; GCN-NEXT: v_accvgpr_write_b32 a5, s17 +; GCN-NEXT: v_accvgpr_write_b32 a6, s18 +; GCN-NEXT: v_accvgpr_write_b32 a7, s19 +; GCN-NEXT: v_accvgpr_write_b32 a8, s20 +; GCN-NEXT: v_accvgpr_write_b32 a9, s21 +; GCN-NEXT: v_accvgpr_write_b32 a10, s22 +; GCN-NEXT: v_accvgpr_write_b32 a11, s23 +; GCN-NEXT: v_accvgpr_write_b32 a12, s24 +; GCN-NEXT: v_accvgpr_write_b32 a13, s25 +; GCN-NEXT: v_accvgpr_write_b32 a14, s26 +; GCN-NEXT: v_accvgpr_write_b32 a15, s27 +; GCN-NEXT: v_mov_b32_e32 v17, s28 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 
op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; 
SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_mov_b32_e32 v24, s28 +; SDAG-NEXT: v_mov_b32_e32 v25, s29 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, s0 -; SDAG-NEXT: v_mov_b32_e32 v17, s1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: 
v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 @@ -5232,30 +3979,45 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: 
v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, s0 -; GISEL-NEXT: v_mov_b32_e32 v17, s1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v24, s20 +; GISEL-NEXT: v_mov_b32_e32 v25, s21 +; GISEL-NEXT: v_mov_b32_e32 v26, s22 +; GISEL-NEXT: v_mov_b32_e32 v27, s23 +; GISEL-NEXT: v_mov_b32_e32 v28, s24 +; GISEL-NEXT: v_mov_b32_e32 v29, s25 +; GISEL-NEXT: v_mov_b32_e32 v30, s26 +; GISEL-NEXT: v_mov_b32_e32 v31, s27 +; GISEL-NEXT: v_mov_b32_e32 v32, s28 +; GISEL-NEXT: v_mov_b32_e32 v33, s29 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v31 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v32 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v33 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: 
v_accvgpr_read_b32 v0, a0 @@ -5279,31 +4041,31 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 33 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_mov_b32_e32 v17, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: 
v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5324,12 +4086,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: v_mov_b32_e32 v31, 33 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 @@ -5344,10 +4107,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 ; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v17, s0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 
v0, a0 @@ -5367,35 +4129,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 +; 
SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_mov_b32_e32 v17, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5416,12 +4178,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 +; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 @@ -5436,10 +4199,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 ; GISEL-NEXT: 
v_accvgpr_write_b32 a13, v29 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v17, s0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5459,48 +4221,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s0 -; SDAG-NEXT: v_mov_b32_e32 v17, s1 -; SDAG-NEXT: v_mov_b32_e32 v18, s2 -; SDAG-NEXT: v_mov_b32_e32 v19, s3 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 -; 
SDAG-NEXT: v_accvgpr_write_b32 a14, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v7 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v6 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v5 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v4 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v3 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s28 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s29 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: 
s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5521,39 +4270,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v2 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v3 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v4 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v5 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v6 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v7 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a0, s28 -; GISEL-NEXT: v_accvgpr_write_b32 a1, s29 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; 
GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5573,91 +4313,79 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: ; SDAG: ; 
%bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v14, s16 -; SDAG-NEXT: v_mov_b32_e32 v15, s17 -; SDAG-NEXT: v_mov_b32_e32 v16, s18 -; SDAG-NEXT: v_mov_b32_e32 v17, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 1.0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: 
v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: 
v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5677,41 +4405,35 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v14, s16 -; SDAG-NEXT: v_mov_b32_e32 v15, s17 -; SDAG-NEXT: v_mov_b32_e32 v16, s18 -; SDAG-NEXT: v_mov_b32_e32 v17, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, 
v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5732,36 +4454,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494 +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, 
a0 @@ -5781,41 +4497,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: 
v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v14, s16 -; SDAG-NEXT: v_mov_b32_e32 v15, s17 -; SDAG-NEXT: v_mov_b32_e32 v16, s18 -; SDAG-NEXT: v_mov_b32_e32 v17, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[10:17], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -5836,36 +4546,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: 
v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v8, s20 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[10:17], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: 
v_accvgpr_read_b32 v0, a0 @@ -5885,2016 +4589,854 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s16 -; GCN-NEXT: v_accvgpr_write_b32 a5, s17 -; GCN-NEXT: v_accvgpr_write_b32 a6, s18 -; GCN-NEXT: v_accvgpr_write_b32 a7, s19 -; GCN-NEXT: v_accvgpr_write_b32 a8, s20 -; GCN-NEXT: v_accvgpr_write_b32 a9, s21 -; GCN-NEXT: v_accvgpr_write_b32 a10, s22 -; GCN-NEXT: v_accvgpr_write_b32 a11, s23 -; GCN-NEXT: v_accvgpr_write_b32 a12, s24 -; GCN-NEXT: v_accvgpr_write_b32 a13, s25 -; GCN-NEXT: v_accvgpr_write_b32 a14, s26 -; GCN-NEXT: v_accvgpr_write_b32 a15, s27 -; GCN-NEXT: v_mov_b32_e32 v17, s28 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: 
v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s0 -; SDAG-NEXT: v_mov_b32_e32 v17, s1 -; SDAG-NEXT: v_mov_b32_e32 v18, s2 -; SDAG-NEXT: v_mov_b32_e32 v19, s3 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s21 -; SDAG-NEXT: 
v_accvgpr_write_b32 a2, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s28 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s29 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, 
a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; SDAG-NEXT: s_endpgm ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a0, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s24 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s25 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s26 -; GISEL-NEXT: 
v_accvgpr_write_b32 a7, s27 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s28 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s29 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b32_e32 v32, s0 +; GISEL-NEXT: v_mov_b32_e32 v33, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: 
v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) + store <16 x float> %result, ptr addrspace(1) %ptr, align 64 + ret void } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, -2 -; SDAG-NEXT: v_mov_b32_e32 v17, 33 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v32, -2 +; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: 
v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, 33 -; GISEL-NEXT: v_mov_b32_e32 v17, -2 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] 
offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v33, -2 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 
v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) - ret <16 x float> %result +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) + store <16 x float> %result, ptr addrspace(1) %ptr, align 64 + ret void } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: 
v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, -2 -; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; SDAG-NEXT: v_mov_b32_e32 v14, s24 +; SDAG-NEXT: v_mov_b32_e32 v15, s25 +; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; 
SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 
sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v17, -2 -; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: 
v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b32_e32 v20, s0 +; GISEL-NEXT: v_mov_b32_e32 v21, s1 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) - ret <16 x float> %result -} - -define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v17, 1.0 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 
2, i32 65, i32 2, i32 1065353216) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, -2 -; SDAG-NEXT: v_mov_b32_e32 v17, 1.0 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: 
v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v17, -2 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0.15915494 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, 0.15915494 -; GISEL-NEXT: v_mov_b32_e32 v17, 1.0 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; 
GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d -; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: 
v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) - ret <16 x float> %result -} - -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, s18 -; SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; 
SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_mov_b32_e32 v32, s0 -; GISEL-NEXT: v_mov_b32_e32 v33, s1 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 -; GISEL-NEXT: s_endpgm - %result = call 
<16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) - store <16 x float> %result, ptr addrspace(1) %ptr, align 64 - ret void -} - -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v32, -2 -; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, s18 -; SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: 
s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v33, -2 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; GISEL-NEXT: s_endpgm - %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) - store <16 x float> %result, ptr addrspace(1) %ptr, align 64 - ret void -} - -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s16 -; SDAG-NEXT: v_mov_b32_e32 v7, s17 -; SDAG-NEXT: v_mov_b32_e32 v8, s18 -; SDAG-NEXT: v_mov_b32_e32 v9, s19 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; SDAG-NEXT: v_mov_b32_e32 v14, s24 -; SDAG-NEXT: v_mov_b32_e32 v15, s25 -; SDAG-NEXT: v_mov_b32_e32 v16, s26 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s0 -; 
SDAG-NEXT: v_mov_b32_e32 v1, s1 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0] -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 -; GISEL-NEXT: v_mov_b64_e32 
v[2:3], 16 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b32_e32 v4, s0 -; GISEL-NEXT: v_mov_b32_e32 v5, s1 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[6:13], v[14:21], a[0:15], v4, v5 op_sel_hi:[0,0,0] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], v[12:15], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: 
global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[20:23], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], a[4:7], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[8:11], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[12:15], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) - store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 - store volatile <16 x float> %result, ptr addrspace(1) null, align 64 - ret void -} - -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v0, 42 -; SDAG-NEXT: v_mov_b32_e32 v1, 25 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s16 -; SDAG-NEXT: v_mov_b32_e32 v7, s17 -; SDAG-NEXT: v_mov_b32_e32 v8, s18 -; SDAG-NEXT: v_mov_b32_e32 v9, s19 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v14, s24 -; SDAG-NEXT: v_mov_b32_e32 v15, s25 -; SDAG-NEXT: v_mov_b32_e32 v16, s26 -; SDAG-NEXT: v_mov_b32_e32 v17, s27 -; 
SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 
v[2:3], a[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v4, 25 -; GISEL-NEXT: v_mov_b32_e32 v5, 42 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[6:13], v[14:21], a[0:15], v4, v5 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 
v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], v[12:15], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], v[20:23], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[2:3], a[4:7], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[4:5], a[8:11], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[6:7], a[12:15], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) - store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 - store volatile <16 x float> %result, ptr addrspace(1) null, align 64 - ret void -} - -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s12 -; SDAG-NEXT: v_mov_b32_e32 v33, 
s13 -; SDAG-NEXT: v_mov_b32_e32 v34, s14 -; SDAG-NEXT: v_mov_b32_e32 v35, s15 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 -; SDAG-NEXT: v_mov_b32_e32 v40, s20 -; SDAG-NEXT: v_mov_b32_e32 v41, s21 -; SDAG-NEXT: v_mov_b32_e32 v42, s22 -; SDAG-NEXT: v_mov_b32_e32 v43, s23 -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v44, s24 -; SDAG-NEXT: v_mov_b32_e32 v45, s25 -; SDAG-NEXT: v_mov_b32_e32 v46, s26 -; SDAG-NEXT: v_mov_b32_e32 v47, s27 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; SDAG-NEXT: s_nop 14 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 -; 
SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; 
GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) - store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 - store volatile <16 x float> %result, ptr addrspace(1) null, align 64 - ret void -} - -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v32, 42 -; SDAG-NEXT: v_mov_b32_e32 v33, 25 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; 
SDAG-NEXT: v_mov_b32_e32 v27, s23 -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], 
v[8:11], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v32, 25 -; GISEL-NEXT: v_mov_b32_e32 v33, 42 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 
v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_endpgm - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) - store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 - store volatile <16 x float> %result, ptr addrspace(1) null, align 64 - ret void -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: 
v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; 
GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 
a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 
s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: 
v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, 1 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, 
v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) - ret <16 x float> %result +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], 
off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 1 +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v0, 42 +; SDAG-NEXT: v_mov_b32_e32 
v1, 25 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v14, s24 +; SDAG-NEXT: v_mov_b32_e32 v15, s25 +; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; 
SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm ; -; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_mov_b32_e32 v16, 1 -; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b32_e32 v20, 25 +; GISEL-NEXT: v_mov_b32_e32 v21, 42 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; 
GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> 
%arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) - ret <16 x float> %result +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void } -; -------------------------------------------------------------------- -; Incorrect signature for format cases (IR vector too large) -; -------------------------------------------------------------------- - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: 
v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v44, s24 +; SDAG-NEXT: v_mov_b32_e32 v45, s25 +; SDAG-NEXT: v_mov_b32_e32 v46, s26 +; SDAG-NEXT: v_mov_b32_e32 v47, s27 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; SDAG-NEXT: 
s_nop 14 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; 
SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], 
s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) - ret <16 x float> %result +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: s_waitcnt 
vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: 
v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v32, 42 +; SDAG-NEXT: v_mov_b32_e32 v33, 25 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v24, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s21 +; SDAG-NEXT: v_mov_b32_e32 v26, s22 +; SDAG-NEXT: v_mov_b32_e32 v27, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v28, s24 +; SDAG-NEXT: v_mov_b32_e32 v29, s25 +; SDAG-NEXT: v_mov_b32_e32 v30, s26 +; SDAG-NEXT: v_mov_b32_e32 v31, s27 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; 
SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:2 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: 
global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b32_e32 v32, 25 +; GISEL-NEXT: v_mov_b32_e32 v33, 42 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 
v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 0, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) +; 
GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: 
v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: 
v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_mov_b32_e32 v31, 1 +; SDAG-NEXT: v_mov_b32_e32 v32, 0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -7913,14 +5455,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0 +; GISEL-NEXT: v_mov_b32_e32 v32, 1 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -7936,8 +5478,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -7955,37 +5498,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp - i32 0, i32 %scale0, i32 0, i32 %scale1) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_mov_b32_e32 v31, 0 +; SDAG-NEXT: v_mov_b32_e32 v32, 1 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; 
SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -8004,10 +5547,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 1 +; GISEL-NEXT: v_mov_b32_e32 v32, 0 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -8025,8 +5570,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -8044,38 +5590,39 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 2, ; cbsz - i32 2, ; blgp - i32 0, i32 0, i32 0, i32 0) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, 
i32 1, i32 0, i32 0) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: +; -------------------------------------------------------------------- +; Incorrect signature for format cases (IR vector too large) +; -------------------------------------------------------------------- + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -8096,14 +5643,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -8119,7 +5666,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], 
a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -8141,36 +5688,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz - i32 4, ; blgp + i32 2, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: 
v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -8191,14 +5738,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -8214,7 +5761,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; 
GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -8235,38 +5782,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz + i32 2, ; cbsz i32 0, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; 
SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4 -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -8285,31 +5832,31 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4 -; GISEL-NEXT: s_nop 15 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -8327,37 +5874,86 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; GISEL-NEXT: 
v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, - i32 0, ; cbsz - i32 4, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v14 -; SDAG-NEXT: scratch_load_dword v14, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v15 +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: 
v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -8378,29 +5974,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v14 -; GISEL-NEXT: scratch_load_dword v14, off, s32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a12, 
v26 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v29 +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -8420,39 +6017,40 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, - i32 4, ; cbsz - i32 0, ; blgp + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: 
v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -8471,14 +6069,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 @@ -8494,8 +6092,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -8515,34 +6114,136 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, 
a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz i32 4, ; blgp i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 
op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 15 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; 
SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -8562,10 +6263,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -8583,7 +6286,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 
a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -8602,6 +6305,54 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; 
GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz i32 4, ; blgp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index c2b7e51c43bc8..6eb9449069a52 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -149,19 +149,19 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, < ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v16, 
s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4 +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -247,151 +247,168 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x32_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x32_f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; 
SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x32_f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 
abid:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: 
v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> 
@llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: +; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, v10 -; SDAG-NEXT: v_mov_b32_e32 v15, v9 -; SDAG-NEXT: v_mov_b32_e32 v14, v8 -; SDAG-NEXT: v_mov_b32_e32 v13, v7 -; SDAG-NEXT: v_mov_b32_e32 v12, v6 -; SDAG-NEXT: v_mov_b32_e32 v11, v5 -; SDAG-NEXT: v_mov_b32_e32 v10, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v7, v1 -; SDAG-NEXT: v_mov_b32_e32 v6, v0 -; SDAG-NEXT: v_mov_b32_e32 v0, s24 -; SDAG-NEXT: v_mov_b32_e32 v1, s25 -; SDAG-NEXT: v_mov_b32_e32 v2, s26 -; SDAG-NEXT: v_mov_b32_e32 v3, s27 -; SDAG-NEXT: v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 -; SDAG-NEXT: v_mov_b32_e32 v20, s18 -; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_mov_b32_e32 v22, s20 -; SDAG-NEXT: v_mov_b32_e32 v23, s21 -; SDAG-NEXT: v_mov_b32_e32 v24, s22 -; SDAG-NEXT: v_mov_b32_e32 v25, s23 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; 
SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: +; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: v_mov_b32_e32 v19, v1 -; GISEL-NEXT: v_mov_b32_e32 v20, v2 -; GISEL-NEXT: v_mov_b32_e32 v21, v3 -; GISEL-NEXT: v_mov_b32_e32 v22, v4 -; GISEL-NEXT: v_mov_b32_e32 v23, v5 -; GISEL-NEXT: v_mov_b32_e32 v24, v6 -; GISEL-NEXT: v_mov_b32_e32 v25, v7 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v12, s24 -; GISEL-NEXT: v_mov_b32_e32 v13, s25 -; GISEL-NEXT: v_mov_b32_e32 v14, s26 -; GISEL-NEXT: v_mov_b32_e32 v15, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[28:31], v[0:7], v10 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 ; GISEL-NEXT: 
v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -408,6 +425,104 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: 
v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 
v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -664,37 +779,53 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, s0 -; GCN-NEXT: v_mov_b32_e32 v27, s1 -; GCN-NEXT: v_mov_b32_e32 v28, s2 -; GCN-NEXT: v_mov_b32_e32 v29, s3 -; GCN-NEXT: v_mov_b32_e32 v16, v10 -; GCN-NEXT: v_mov_b32_e32 v15, v9 -; GCN-NEXT: v_mov_b32_e32 v14, v8 -; GCN-NEXT: v_mov_b32_e32 v13, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v11, v5 -; GCN-NEXT: v_mov_b32_e32 v10, v4 -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NEXT: v_mov_b32_e32 v4, s28 -; GCN-NEXT: v_mov_b32_e32 v5, s29 -; GCN-NEXT: v_mov_b32_e32 v18, s16 -; GCN-NEXT: v_mov_b32_e32 v19, s17 -; GCN-NEXT: v_mov_b32_e32 v20, s18 -; GCN-NEXT: v_mov_b32_e32 v21, s19 -; GCN-NEXT: v_mov_b32_e32 v22, s20 -; GCN-NEXT: v_mov_b32_e32 v23, s21 -; GCN-NEXT: v_mov_b32_e32 v24, s22 -; GCN-NEXT: v_mov_b32_e32 v25, s23 +; GCN-NEXT: v_mov_b32_e32 v36, s0 +; GCN-NEXT: v_mov_b32_e32 v37, s1 +; GCN-NEXT: v_mov_b32_e32 v38, s2 +; GCN-NEXT: v_mov_b32_e32 v39, s3 +; GCN-NEXT: v_mov_b32_e32 v13, s25 +; GCN-NEXT: v_mov_b32_e32 v14, s26 +; GCN-NEXT: v_mov_b32_e32 v15, s27 +; GCN-NEXT: v_mov_b32_e32 v16, s28 +; GCN-NEXT: v_mov_b32_e32 v17, 
s29 +; GCN-NEXT: v_mov_b32_e32 v28, s16 +; GCN-NEXT: v_mov_b32_e32 v29, s17 +; GCN-NEXT: v_mov_b32_e32 v30, s18 +; GCN-NEXT: v_mov_b32_e32 v31, s19 +; GCN-NEXT: v_mov_b32_e32 v32, s20 +; GCN-NEXT: v_mov_b32_e32 v33, s21 +; GCN-NEXT: v_mov_b32_e32 v34, s22 +; GCN-NEXT: v_mov_b32_e32 v35, s23 +; GCN-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NEXT: v_mov_b32_e32 v18, v0 +; GCN-NEXT: v_mov_b32_e32 v19, v1 +; GCN-NEXT: v_mov_b32_e32 v20, v2 +; GCN-NEXT: v_mov_b32_e32 v21, v3 +; GCN-NEXT: v_mov_b32_e32 v22, v4 +; GCN-NEXT: v_mov_b32_e32 v23, v5 +; GCN-NEXT: v_mov_b32_e32 v24, v6 +; GCN-NEXT: v_mov_b32_e32 v25, v7 +; GCN-NEXT: v_mov_b32_e32 v26, v8 +; GCN-NEXT: v_mov_b32_e32 v27, v9 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10 +; GCN-NEXT: s_nop 11 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -851,19 +982,19 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] -; GISEL-NEXT: 
v_mov_b64_e32 v[6:7], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4 +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x i32> %result @@ -955,151 +1086,44 @@ bb: } define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_i32_32x32x64_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result 
= call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) - ret <16 x i32> %result -} - -define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) - ret <16 x i32> %result -} - -define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, 
v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x i32> %result -} - -define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: +; SDAG-LABEL: test_smfmac_i32_32x32x64_i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, v10 -; SDAG-NEXT: v_mov_b32_e32 v15, v9 -; SDAG-NEXT: v_mov_b32_e32 v14, v8 -; SDAG-NEXT: v_mov_b32_e32 v13, v7 -; SDAG-NEXT: v_mov_b32_e32 v12, v6 -; SDAG-NEXT: v_mov_b32_e32 v11, v5 -; SDAG-NEXT: v_mov_b32_e32 v10, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v7, v1 -; SDAG-NEXT: v_mov_b32_e32 v6, v0 -; SDAG-NEXT: v_mov_b32_e32 v0, s24 -; SDAG-NEXT: v_mov_b32_e32 v1, s25 -; SDAG-NEXT: v_mov_b32_e32 v2, s26 -; SDAG-NEXT: v_mov_b32_e32 v3, s27 -; SDAG-NEXT: v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 -; SDAG-NEXT: v_mov_b32_e32 v20, s18 -; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_mov_b32_e32 v22, s20 -; SDAG-NEXT: v_mov_b32_e32 v23, s21 -; SDAG-NEXT: v_mov_b32_e32 v24, s22 -; SDAG-NEXT: v_mov_b32_e32 v25, s23 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: 
v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: +; GISEL-LABEL: test_smfmac_i32_32x32x64_i8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: v_mov_b32_e32 v19, v1 -; GISEL-NEXT: v_mov_b32_e32 v20, v2 -; GISEL-NEXT: v_mov_b32_e32 v21, v3 -; GISEL-NEXT: v_mov_b32_e32 v22, v4 -; GISEL-NEXT: v_mov_b32_e32 v23, v5 -; GISEL-NEXT: v_mov_b32_e32 v24, v6 -; GISEL-NEXT: v_mov_b32_e32 v25, v7 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v12, s24 -; GISEL-NEXT: v_mov_b32_e32 v13, s25 -; GISEL-NEXT: v_mov_b32_e32 v14, s26 -; GISEL-NEXT: v_mov_b32_e32 v15, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[28:31], v[0:7], v10 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; 
GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1116,19 +1140,241 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x i32> %result } -; -------------------------------------------------------------------- -; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 -; -------------------------------------------------------------------- - -declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg) - -define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: +define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { +; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, 
v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) + ret <16 x i32> %result +} + +define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) { +; SDAG-LABEL: 
test_smfmac_i32_32x32x64_i8__flags1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 
v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x i32> %result +} + +define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; 
SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x 
i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) + ret <16 x i32> %result +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 +; -------------------------------------------------------------------- + +declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg) + +define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 { +; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: ; SDAG: ; %bb.0: ; %bb ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 @@ -1272,19 +1518,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -1441,19 +1687,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -1610,19 +1856,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] -; GISEL-NEXT: 
v_mov_b64_e32 v[10:11], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -1779,19 +2025,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 ; GISEL-NEXT: v_mov_b32_e32 v2, s26 ; GISEL-NEXT: v_mov_b32_e32 v3, s27 -; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: 
s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -1883,151 +2129,168 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, 
v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], 
v[4:11], v28 cbsz:1 abid:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: 
v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 
s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, v10 -; SDAG-NEXT: v_mov_b32_e32 v15, v9 -; SDAG-NEXT: v_mov_b32_e32 v14, v8 -; SDAG-NEXT: v_mov_b32_e32 v13, v7 -; SDAG-NEXT: v_mov_b32_e32 v12, v6 -; SDAG-NEXT: v_mov_b32_e32 v11, v5 -; SDAG-NEXT: v_mov_b32_e32 v10, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v7, v1 -; SDAG-NEXT: v_mov_b32_e32 v6, v0 -; SDAG-NEXT: v_mov_b32_e32 v0, s24 -; SDAG-NEXT: v_mov_b32_e32 v1, s25 -; SDAG-NEXT: v_mov_b32_e32 v2, s26 -; SDAG-NEXT: v_mov_b32_e32 v3, s27 -; SDAG-NEXT: v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 -; SDAG-NEXT: v_mov_b32_e32 v20, s18 -; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_mov_b32_e32 v22, s20 -; SDAG-NEXT: v_mov_b32_e32 v23, s21 -; SDAG-NEXT: v_mov_b32_e32 v24, s22 -; SDAG-NEXT: v_mov_b32_e32 v25, s23 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: 
v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: v_mov_b32_e32 v19, v1 -; GISEL-NEXT: v_mov_b32_e32 v20, v2 -; GISEL-NEXT: v_mov_b32_e32 v21, v3 -; GISEL-NEXT: v_mov_b32_e32 v22, v4 -; GISEL-NEXT: v_mov_b32_e32 v23, v5 -; GISEL-NEXT: v_mov_b32_e32 v24, v6 -; GISEL-NEXT: v_mov_b32_e32 v25, v7 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v12, s24 -; GISEL-NEXT: v_mov_b32_e32 v13, s25 -; GISEL-NEXT: v_mov_b32_e32 v14, s26 -; GISEL-NEXT: v_mov_b32_e32 v15, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[28:31], v[0:7], v10 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: 
v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2044,6 +2307,104 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; 
SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 
v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -2135,151 +2496,168 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; 
SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: 
test_smfmac_f32_32x32x64_bf8_fp8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 
+; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: 
v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, v10 -; SDAG-NEXT: v_mov_b32_e32 v15, v9 -; SDAG-NEXT: v_mov_b32_e32 v14, v8 -; SDAG-NEXT: v_mov_b32_e32 v13, v7 -; SDAG-NEXT: v_mov_b32_e32 v12, v6 -; SDAG-NEXT: v_mov_b32_e32 v11, v5 -; SDAG-NEXT: v_mov_b32_e32 v10, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v7, v1 -; SDAG-NEXT: v_mov_b32_e32 v6, v0 -; SDAG-NEXT: v_mov_b32_e32 v0, s24 -; SDAG-NEXT: v_mov_b32_e32 v1, s25 -; SDAG-NEXT: v_mov_b32_e32 v2, s26 -; SDAG-NEXT: v_mov_b32_e32 v3, s27 -; SDAG-NEXT: v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 -; SDAG-NEXT: v_mov_b32_e32 v20, s18 -; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_mov_b32_e32 v22, s20 -; SDAG-NEXT: v_mov_b32_e32 v23, s21 -; SDAG-NEXT: v_mov_b32_e32 v24, s22 -; SDAG-NEXT: v_mov_b32_e32 v25, s23 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; 
SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: v_mov_b32_e32 v19, v1 -; GISEL-NEXT: v_mov_b32_e32 v20, v2 -; GISEL-NEXT: v_mov_b32_e32 v21, v3 -; GISEL-NEXT: v_mov_b32_e32 v22, v4 -; GISEL-NEXT: v_mov_b32_e32 v23, v5 -; GISEL-NEXT: v_mov_b32_e32 v24, v6 -; GISEL-NEXT: v_mov_b32_e32 v25, v7 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v12, s24 -; GISEL-NEXT: v_mov_b32_e32 v13, s25 -; GISEL-NEXT: v_mov_b32_e32 v14, s26 -; GISEL-NEXT: v_mov_b32_e32 v15, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[28:31], v[0:7], v10 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: 
v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2296,6 +2674,104 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, 
v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 
v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -2387,151 +2863,168 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 
+; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> 
@test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: 
v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: 
v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, v10 -; SDAG-NEXT: v_mov_b32_e32 v15, v9 -; SDAG-NEXT: v_mov_b32_e32 v14, v8 -; SDAG-NEXT: v_mov_b32_e32 v13, v7 -; SDAG-NEXT: v_mov_b32_e32 v12, v6 -; SDAG-NEXT: v_mov_b32_e32 v11, v5 -; SDAG-NEXT: v_mov_b32_e32 v10, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v7, v1 -; SDAG-NEXT: v_mov_b32_e32 v6, v0 -; SDAG-NEXT: v_mov_b32_e32 v0, s24 -; SDAG-NEXT: v_mov_b32_e32 v1, s25 -; SDAG-NEXT: v_mov_b32_e32 v2, s26 -; SDAG-NEXT: v_mov_b32_e32 v3, s27 -; SDAG-NEXT: v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 -; SDAG-NEXT: v_mov_b32_e32 v20, s18 -; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_mov_b32_e32 v22, s20 -; SDAG-NEXT: v_mov_b32_e32 v23, s21 -; SDAG-NEXT: v_mov_b32_e32 v24, s22 -; SDAG-NEXT: v_mov_b32_e32 v25, s23 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-NEXT: 
v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: v_mov_b32_e32 v19, v1 -; GISEL-NEXT: v_mov_b32_e32 v20, v2 -; GISEL-NEXT: v_mov_b32_e32 v21, v3 -; GISEL-NEXT: v_mov_b32_e32 v22, v4 -; GISEL-NEXT: v_mov_b32_e32 v23, v5 -; GISEL-NEXT: v_mov_b32_e32 v24, v6 -; GISEL-NEXT: v_mov_b32_e32 v25, v7 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v12, s24 -; GISEL-NEXT: v_mov_b32_e32 v13, s25 -; GISEL-NEXT: v_mov_b32_e32 v14, s26 -; GISEL-NEXT: v_mov_b32_e32 v15, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[28:31], v[0:7], v10 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: 
v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2548,6 +3041,104 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 
v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: 
v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -2639,151 +3230,168 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 +; 
SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> 
@llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) { -; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: 
v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <16 x float> %result -} - -define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s0 -; SDAG-NEXT: v_mov_b32_e32 v27, s1 -; SDAG-NEXT: v_mov_b32_e32 v28, s2 -; SDAG-NEXT: v_mov_b32_e32 v29, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, v10 -; SDAG-NEXT: v_mov_b32_e32 v15, v9 -; SDAG-NEXT: v_mov_b32_e32 v14, v8 -; SDAG-NEXT: v_mov_b32_e32 v13, v7 -; SDAG-NEXT: v_mov_b32_e32 v12, v6 -; SDAG-NEXT: v_mov_b32_e32 v11, v5 -; SDAG-NEXT: v_mov_b32_e32 v10, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v7, v1 -; SDAG-NEXT: v_mov_b32_e32 v6, v0 -; SDAG-NEXT: v_mov_b32_e32 v0, s24 -; SDAG-NEXT: v_mov_b32_e32 v1, s25 -; SDAG-NEXT: v_mov_b32_e32 v2, s26 -; SDAG-NEXT: v_mov_b32_e32 v3, s27 -; SDAG-NEXT: v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 -; SDAG-NEXT: v_mov_b32_e32 v20, s18 -; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_mov_b32_e32 v22, s20 -; SDAG-NEXT: v_mov_b32_e32 v23, s21 -; 
SDAG-NEXT: v_mov_b32_e32 v24, s22 -; SDAG-NEXT: v_mov_b32_e32 v25, s23 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: v_mov_b32_e32 v19, v1 -; GISEL-NEXT: v_mov_b32_e32 v20, v2 -; GISEL-NEXT: v_mov_b32_e32 v21, v3 -; GISEL-NEXT: v_mov_b32_e32 v22, v4 -; GISEL-NEXT: v_mov_b32_e32 v23, v5 -; GISEL-NEXT: v_mov_b32_e32 v24, v6 -; GISEL-NEXT: v_mov_b32_e32 v25, v7 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v12, s24 -; GISEL-NEXT: v_mov_b32_e32 v13, s25 -; GISEL-NEXT: v_mov_b32_e32 v14, s26 -; GISEL-NEXT: v_mov_b32_e32 v15, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: 
v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[28:31], v[0:7], v10 -; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-NEXT: v_mov_b32_e32 v37, v11 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2800,6 +3408,104 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v13, v25 ; GISEL-NEXT: v_mov_b32_e32 v14, v26 ; GISEL-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <16 x float> %result +} + +define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; 
SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 
+; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index d3e171be10802..4366472c73a0e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -246,6 +246,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 @@ -279,6 +280,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX942-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen ; GFX942-NEXT: ; 
implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr7 +; GFX942-NEXT: ; implicit-def: $vgpr0 ; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 @@ -418,6 +420,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 @@ -451,6 +454,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr7 +; GFX942-NEXT: ; implicit-def: $vgpr0 ; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 5b72e006072df..0191a85b33888 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -193,8 +193,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: buffer_atomic_add_f32 v1, v[8:9], s[8:11], s12 idxen offen glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen glc ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: 
$vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -203,7 +202,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: @@ -229,8 +227,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: buffer_atomic_add_f32 v1, v[8:9], s[4:7], s8 idxen offen sc0 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen sc0 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr7 ; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -239,7 +236,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: @@ -343,8 +339,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v1, v[8:9], s[8:11], s12 idxen offen glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen glc ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -353,7 +348,6 @@ define <2 x half> 
@struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: @@ -379,8 +373,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: buffer_atomic_pk_add_f16 v1, v[8:9], s[4:7], s8 idxen offen sc0 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen sc0 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: ; implicit-def: $vgpr7 ; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -389,7 +382,6 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 1c04ff3e83326..9dac2393fd966 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -85,7 +85,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt +; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, 
s[8:11], 0 offen nt ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-SDAG-NEXT: s_mov_b32 s5, s12 @@ -96,9 +96,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt +; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: buffer_nontemporal_load_store: @@ -115,7 +115,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-GISEL-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt +; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 @@ -126,9 +126,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX942-GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt +; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: buffer_nontemporal_load_store: @@ -413,7 +413,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; 
GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-SDAG-NEXT: s_mov_b32 s5, s12 @@ -424,9 +424,9 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -443,7 +443,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-GISEL-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 @@ -454,9 +454,9 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX942-GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, 
s[4:7], 0 offen sc0 sc1 ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index 3c4a29c54928d..9585c486aeb9e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GFX908 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90ADAG,GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90AGSEL,GFX90A %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942DAG,GFX942 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942GSEL,GFX942 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) @@ -86,254 +86,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: 
v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, 
a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_vgpr: -; GFX90ADAG: ; %bb.0: ; %bb -; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90ADAG-NEXT: v_mov_b32_e32 v33, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v32, 0 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 -; GFX90ADAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: v_mov_b32_e32 v0, s16 -; GFX90ADAG-NEXT: v_mov_b32_e32 v1, s17 -; GFX90ADAG-NEXT: v_mov_b32_e32 v2, s18 -; GFX90ADAG-NEXT: v_mov_b32_e32 v3, s19 -; GFX90ADAG-NEXT: v_mov_b32_e32 v4, s20 -; GFX90ADAG-NEXT: v_mov_b32_e32 v5, s21 -; GFX90ADAG-NEXT: v_mov_b32_e32 v6, s22 -; GFX90ADAG-NEXT: v_mov_b32_e32 v7, s23 -; GFX90ADAG-NEXT: v_mov_b32_e32 v8, s24 -; GFX90ADAG-NEXT: v_mov_b32_e32 
v9, s25 -; GFX90ADAG-NEXT: v_mov_b32_e32 v10, s26 -; GFX90ADAG-NEXT: v_mov_b32_e32 v11, s27 -; GFX90ADAG-NEXT: v_mov_b32_e32 v12, s28 -; GFX90ADAG-NEXT: v_mov_b32_e32 v13, s29 -; GFX90ADAG-NEXT: v_mov_b32_e32 v14, s30 -; GFX90ADAG-NEXT: v_mov_b32_e32 v15, s31 -; GFX90ADAG-NEXT: v_mov_b32_e32 v16, s0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v17, s1 -; GFX90ADAG-NEXT: v_mov_b32_e32 v18, s2 -; GFX90ADAG-NEXT: v_mov_b32_e32 v19, s3 -; GFX90ADAG-NEXT: v_mov_b32_e32 v20, s4 -; GFX90ADAG-NEXT: v_mov_b32_e32 v21, s5 -; GFX90ADAG-NEXT: v_mov_b32_e32 v22, s6 -; GFX90ADAG-NEXT: v_mov_b32_e32 v23, s7 -; GFX90ADAG-NEXT: v_mov_b32_e32 v24, s8 -; GFX90ADAG-NEXT: v_mov_b32_e32 v25, s9 -; GFX90ADAG-NEXT: v_mov_b32_e32 v26, s10 -; GFX90ADAG-NEXT: v_mov_b32_e32 v27, s11 -; GFX90ADAG-NEXT: v_mov_b32_e32 v28, s12 -; GFX90ADAG-NEXT: v_mov_b32_e32 v29, s13 -; GFX90ADAG-NEXT: v_mov_b32_e32 v30, s14 -; GFX90ADAG-NEXT: v_mov_b32_e32 v31, s15 -; GFX90ADAG-NEXT: s_nop 1 -; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 -; GFX90ADAG-NEXT: s_endpgm -; -; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_vgpr: -; GFX90AGSEL: ; %bb.0: ; %bb -; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v33, 2.0 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: s_load_dwordx16 s[0:15], 
s[34:35], 0x0 -; GFX90AGSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[22:23], s[22:23], s[22:23] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1] -; GFX90AGSEL-NEXT: v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1] -; GFX90AGSEL-NEXT: s_nop 1 -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] -; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 0 -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: s_nop 1 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 -; 
GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 -; GFX90AGSEL-NEXT: s_endpgm -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_vgpr: -; GFX942DAG: ; %bb.0: ; %bb -; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942DAG-NEXT: v_mov_b32_e32 v33, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v32, 0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 -; GFX942DAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: v_mov_b32_e32 v0, s16 -; GFX942DAG-NEXT: v_mov_b32_e32 v1, s17 -; GFX942DAG-NEXT: v_mov_b32_e32 v2, s18 -; GFX942DAG-NEXT: v_mov_b32_e32 v3, s19 -; GFX942DAG-NEXT: v_mov_b32_e32 v4, s20 -; GFX942DAG-NEXT: v_mov_b32_e32 v5, s21 -; GFX942DAG-NEXT: v_mov_b32_e32 v6, s22 -; GFX942DAG-NEXT: v_mov_b32_e32 v7, s23 -; GFX942DAG-NEXT: v_mov_b32_e32 v8, s24 -; GFX942DAG-NEXT: v_mov_b32_e32 v9, s25 -; GFX942DAG-NEXT: v_mov_b32_e32 v10, s26 -; GFX942DAG-NEXT: v_mov_b32_e32 v11, s27 -; GFX942DAG-NEXT: v_mov_b32_e32 v12, s28 -; GFX942DAG-NEXT: v_mov_b32_e32 v13, s29 -; GFX942DAG-NEXT: v_mov_b32_e32 v14, s30 -; GFX942DAG-NEXT: v_mov_b32_e32 v15, s31 -; GFX942DAG-NEXT: v_mov_b32_e32 v16, s0 -; GFX942DAG-NEXT: v_mov_b32_e32 v17, s1 -; GFX942DAG-NEXT: v_mov_b32_e32 v18, s2 -; GFX942DAG-NEXT: v_mov_b32_e32 v19, s3 -; GFX942DAG-NEXT: v_mov_b32_e32 v20, s4 -; GFX942DAG-NEXT: v_mov_b32_e32 v21, s5 -; GFX942DAG-NEXT: v_mov_b32_e32 v22, s6 -; GFX942DAG-NEXT: v_mov_b32_e32 v23, s7 -; GFX942DAG-NEXT: v_mov_b32_e32 v24, s8 -; GFX942DAG-NEXT: v_mov_b32_e32 v25, s9 -; GFX942DAG-NEXT: v_mov_b32_e32 v26, s10 -; GFX942DAG-NEXT: v_mov_b32_e32 v27, s11 -; GFX942DAG-NEXT: v_mov_b32_e32 v28, s12 -; GFX942DAG-NEXT: v_mov_b32_e32 v29, s13 -; GFX942DAG-NEXT: v_mov_b32_e32 v30, s14 -; GFX942DAG-NEXT: v_mov_b32_e32 v31, s15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] -; 
GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 -; GFX942DAG-NEXT: s_endpgm -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_vgpr: -; GFX942GSEL: ; %bb.0: ; %bb -; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v33, 2.0 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 -; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[24:25], s[24:25] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[26:27], s[26:27] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[28:29], s[28:29] -; GFX942GSEL-NEXT: v_mov_b64_e32 v[30:31], s[30:31] -; GFX942GSEL-NEXT: s_nop 1 -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], 
v32, v33, v[0:31] -; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 0 -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: s_nop 0 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 -; GFX942GSEL-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -420,286 +228,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; 
GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GFX908-NEXT: s_nop 0 
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_agpr: -; GFX90ADAG: ; %bb.0: ; %bb -; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 -; GFX90ADAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a23, s7 
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a31, s15 -; GFX90ADAG-NEXT: s_nop 1 -; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 -; GFX90ADAG-NEXT: s_endpgm -; -; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_agpr: -; GFX90AGSEL: ; %bb.0: ; %bb -; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 -; GFX90AGSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a16, s16 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a7, s7 -; 
GFX90AGSEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a15, s15 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a17, s17 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a18, s18 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a19, s19 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a20, s20 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a21, s21 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a22, s22 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a23, s23 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a24, s24 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a25, s25 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a26, s26 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a27, s27 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a28, s28 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a29, s29 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a30, s30 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a31, s31 -; GFX90AGSEL-NEXT: s_nop 1 -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: s_nop 1 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX90AGSEL-NEXT: s_endpgm -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_agpr: -; GFX942DAG: ; %bb.0: ; 
%bb -; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 -; GFX942DAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a31, s15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] -; 
GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 -; GFX942DAG-NEXT: s_endpgm -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_agpr: -; GFX942GSEL: ; %bb.0: ; %bb -; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 -; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a16, s16 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a15, s15 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a17, s17 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a18, s18 -; 
GFX942GSEL-NEXT: v_accvgpr_write_b32 a19, s19 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a20, s20 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a21, s21 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a22, s22 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a23, s23 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a24, s24 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a25, s25 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a26, s26 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a27, s27 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a28, s28 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a29, s29 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a30, s30 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a31, s31 -; GFX942GSEL-NEXT: s_nop 1 -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: s_nop 0 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX942GSEL-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -763,40 +347,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], 
s[0:1] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 @@ -805,134 +389,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: -; GFX90ADAG: ; %bb.0: ; %bb -; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX90ADAG-NEXT: ;;#ASMSTART -; GFX90ADAG-NEXT: ; def a0 -; GFX90ADAG-NEXT: ;;#ASMEND -; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) -; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90ADAG-NEXT: 
global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90ADAG-NEXT: s_endpgm -; -; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: -; GFX90AGSEL: ; %bb.0: ; %bb -; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90AGSEL-NEXT: ;;#ASMSTART -; GFX90AGSEL-NEXT: ; def a0 -; GFX90AGSEL-NEXT: ;;#ASMEND -; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: s_nop 2 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90AGSEL-NEXT: 
global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90AGSEL-NEXT: s_endpgm -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: -; GFX942DAG: ; %bb.0: ; %bb -; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942DAG-NEXT: ;;#ASMSTART -; GFX942DAG-NEXT: ; def a0 -; GFX942DAG-NEXT: ;;#ASMEND -; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] -; GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942DAG-NEXT: s_endpgm -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: -; 
GFX942GSEL: ; %bb.0: ; %bb -; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942GSEL-NEXT: ;;#ASMSTART -; GFX942GSEL-NEXT: ; def a0 -; GFX942GSEL-NEXT: ;;#ASMEND -; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: s_nop 1 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942GSEL-NEXT: s_endpgm bb: %acc = call i32 asm sideeffect "; def $0", "={a0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -997,40 +453,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: 
v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: 
v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 @@ -1039,134 +495,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: -; GFX90ADAG: ; %bb.0: ; %bb -; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX90ADAG-NEXT: ;;#ASMSTART -; GFX90ADAG-NEXT: ; use a[100:131] -; GFX90ADAG-NEXT: ;;#ASMEND -; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) -; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, 
v2, a[0:31] -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90ADAG-NEXT: s_endpgm -; -; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: -; GFX90AGSEL: ; %bb.0: ; %bb -; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX90AGSEL-NEXT: ;;#ASMSTART -; GFX90AGSEL-NEXT: ; use a[100:131] -; GFX90AGSEL-NEXT: ;;#ASMEND -; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: s_nop 2 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; 
GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90AGSEL-NEXT: s_endpgm -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: -; GFX942DAG: ; %bb.0: ; %bb -; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942DAG-NEXT: ;;#ASMSTART -; GFX942DAG-NEXT: ; use a[100:131] -; GFX942DAG-NEXT: ;;#ASMEND -; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] -; GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; 
GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942DAG-NEXT: s_endpgm -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: -; GFX942GSEL: ; %bb.0: ; %bb -; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942GSEL-NEXT: ;;#ASMSTART -; GFX942GSEL-NEXT: ; use a[100:131] -; GFX942GSEL-NEXT: ;;#ASMEND -; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: s_nop 1 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942GSEL-NEXT: s_endpgm bb: call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison) %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -1231,40 +559,40 @@ define amdgpu_kernel 
void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; 
GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 @@ -1273,134 +601,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: -; GFX90ADAG: ; %bb.0: ; %bb -; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90ADAG-NEXT: v_mov_b32_e32 v32, 0 -; GFX90ADAG-NEXT: ;;#ASMSTART -; GFX90ADAG-NEXT: ; def v0 -; GFX90ADAG-NEXT: ;;#ASMEND -; GFX90ADAG-NEXT: v_mov_b32_e32 v33, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX90ADAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; GFX90ADAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; GFX90ADAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; GFX90ADAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; GFX90ADAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; GFX90ADAG-NEXT: 
global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; GFX90ADAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) -; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX90ADAG-NEXT: s_endpgm -; -; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: -; GFX90AGSEL: ; %bb.0: ; %bb -; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90AGSEL-NEXT: ;;#ASMSTART -; GFX90AGSEL-NEXT: ; def v0 -; GFX90AGSEL-NEXT: ;;#ASMEND -; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v33, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GFX90AGSEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: s_nop 
2 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX90AGSEL-NEXT: s_endpgm -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: -; GFX942DAG: ; %bb.0: ; %bb -; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942DAG-NEXT: v_mov_b32_e32 v32, 0 -; GFX942DAG-NEXT: ;;#ASMSTART -; GFX942DAG-NEXT: ; def v0 -; GFX942DAG-NEXT: ;;#ASMEND -; GFX942DAG-NEXT: v_mov_b32_e32 v33, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX942DAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; GFX942DAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; GFX942DAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; GFX942DAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; GFX942DAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; GFX942DAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; GFX942DAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] -; GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[20:23], 
s[0:1] offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; GFX942DAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX942DAG-NEXT: s_endpgm -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: -; GFX942GSEL: ; %bb.0: ; %bb -; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942GSEL-NEXT: ;;#ASMSTART -; GFX942GSEL-NEXT: ; def v0 -; GFX942GSEL-NEXT: ;;#ASMEND -; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v33, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GFX942GSEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; GFX942GSEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; GFX942GSEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; GFX942GSEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; GFX942GSEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; GFX942GSEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; GFX942GSEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: s_nop 1 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX942GSEL-NEXT: 
global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX942GSEL-NEXT: s_endpgm bb: %acc = call i32 asm sideeffect "; def $0", "={v0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -1487,40 +687,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, 
a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 ; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 ; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112 ; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64 ; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80 @@ -1529,205 +729,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35] ; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_call: -; GFX90ADAG: ; %bb.0: ; %bb -; GFX90ADAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90ADAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90ADAG-NEXT: s_mov_b32 s38, -1 -; GFX90ADAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90ADAG-NEXT: s_add_u32 s36, s36, s11 -; GFX90ADAG-NEXT: s_addc_u32 s37, s37, 0 -; GFX90ADAG-NEXT: s_mov_b32 s12, s8 -; GFX90ADAG-NEXT: s_add_u32 s8, s4, 44 -; GFX90ADAG-NEXT: s_mov_b32 s13, s9 -; GFX90ADAG-NEXT: s_addc_u32 s9, s5, 0 -; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90ADAG-NEXT: s_getpc_b64 s[4:5] -; GFX90ADAG-NEXT: 
s_add_u32 s4, s4, foo@gotpcrel32@lo+4 -; GFX90ADAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX90ADAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX90ADAG-NEXT: s_mov_b32 s14, s10 -; GFX90ADAG-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX90ADAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX90ADAG-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90ADAG-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90ADAG-NEXT: v_mov_b32_e32 v31, v0 -; GFX90ADAG-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90ADAG-NEXT: s_mov_b32 s32, 0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v40, 0 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v40, s[34:35] offset:112 -; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v40, s[34:35] offset:96 -; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v40, s[34:35] offset:80 -; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v40, s[34:35] offset:64 -; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v40, s[34:35] offset:48 -; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v40, s[34:35] offset:32 -; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v40, s[34:35] offset:16 -; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v40, s[34:35] -; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) -; GFX90ADAG-NEXT: s_nop 0 -; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[24:27], s[34:35] offset:96 -; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[28:31], s[34:35] offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[16:19], s[34:35] offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[20:23], s[34:35] offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[8:11], s[34:35] offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[12:15], s[34:35] offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[0:3], s[34:35] -; GFX90ADAG-NEXT: 
global_store_dwordx4 v40, a[4:7], s[34:35] offset:16 -; GFX90ADAG-NEXT: s_endpgm -; -; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_call: -; GFX90AGSEL: ; %bb.0: ; %bb -; GFX90AGSEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90AGSEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90AGSEL-NEXT: s_mov_b32 s38, -1 -; GFX90AGSEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90AGSEL-NEXT: s_add_u32 s36, s36, s11 -; GFX90AGSEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX90AGSEL-NEXT: s_mov_b32 s16, s8 -; GFX90AGSEL-NEXT: s_add_u32 s8, s4, 44 -; GFX90AGSEL-NEXT: s_mov_b32 s15, s9 -; GFX90AGSEL-NEXT: s_addc_u32 s9, s5, 0 -; GFX90AGSEL-NEXT: s_mov_b64 s[12:13], s[0:1] -; GFX90AGSEL-NEXT: s_getpc_b64 s[0:1] -; GFX90AGSEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 -; GFX90AGSEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 -; GFX90AGSEL-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0 -; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90AGSEL-NEXT: s_mov_b32 s14, s10 -; GFX90AGSEL-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX90AGSEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90AGSEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90AGSEL-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90AGSEL-NEXT: s_mov_b64 s[4:5], s[12:13] -; GFX90AGSEL-NEXT: s_mov_b32 s12, s16 -; GFX90AGSEL-NEXT: s_mov_b32 s13, s15 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v31, v0 -; GFX90AGSEL-NEXT: s_mov_b32 s32, 0 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[34:35] -; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[34:35] offset:16 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[34:35] offset:32 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[34:35] offset:48 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[34:35] offset:64 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[34:35] offset:80 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[34:35] offset:96 -; GFX90AGSEL-NEXT: 
global_load_dwordx4 a[28:31], v0, s[34:35] offset:112 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) -; GFX90AGSEL-NEXT: s_nop 0 -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: s_nop 2 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX90AGSEL-NEXT: s_endpgm -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_call: -; GFX942DAG: ; %bb.0: ; %bb -; GFX942DAG-NEXT: s_mov_b32 s12, s8 -; GFX942DAG-NEXT: s_add_u32 s8, s4, 44 -; GFX942DAG-NEXT: s_mov_b32 s13, s9 -; GFX942DAG-NEXT: s_addc_u32 s9, s5, 0 -; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942DAG-NEXT: s_getpc_b64 s[4:5] -; GFX942DAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 -; GFX942DAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX942DAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX942DAG-NEXT: s_mov_b32 s14, s10 -; GFX942DAG-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX942DAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX942DAG-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX942DAG-NEXT: v_mov_b32_e32 v31, v0 -; GFX942DAG-NEXT: s_mov_b32 s32, 0 -; GFX942DAG-NEXT: v_mov_b32_e32 v40, 0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v40, s[34:35] offset:112 -; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v40, s[34:35] offset:96 -; GFX942DAG-NEXT: 
global_load_dwordx4 a[20:23], v40, s[34:35] offset:80 -; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v40, s[34:35] offset:64 -; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v40, s[34:35] offset:48 -; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v40, s[34:35] offset:32 -; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v40, s[34:35] offset:16 -; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v40, s[34:35] -; GFX942DAG-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) -; GFX942DAG-NEXT: s_nop 0 -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v40, a[24:27], s[34:35] offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v40, a[28:31], s[34:35] offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v40, a[16:19], s[34:35] offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v40, a[20:23], s[34:35] offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v40, a[8:11], s[34:35] offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v40, a[12:15], s[34:35] offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v40, a[0:3], s[34:35] -; GFX942DAG-NEXT: global_store_dwordx4 v40, a[4:7], s[34:35] offset:16 -; GFX942DAG-NEXT: s_endpgm -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_call: -; GFX942GSEL: ; %bb.0: ; %bb -; GFX942GSEL-NEXT: s_mov_b32 s12, s8 -; GFX942GSEL-NEXT: s_add_u32 s8, s4, 44 -; GFX942GSEL-NEXT: s_mov_b32 s13, s9 -; GFX942GSEL-NEXT: s_addc_u32 s9, s5, 0 -; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942GSEL-NEXT: s_getpc_b64 s[4:5] -; GFX942GSEL-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 -; GFX942GSEL-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX942GSEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX942GSEL-NEXT: s_mov_b32 s14, s10 -; GFX942GSEL-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX942GSEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX942GSEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX942GSEL-NEXT: 
v_mov_b32_e32 v31, v0 -; GFX942GSEL-NEXT: s_mov_b32 s32, 0 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[34:35] -; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[34:35] offset:16 -; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[34:35] offset:32 -; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[34:35] offset:48 -; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[34:35] offset:64 -; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[34:35] offset:80 -; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[34:35] offset:96 -; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[34:35] offset:112 -; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) -; GFX942GSEL-NEXT: s_nop 0 -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: s_nop 1 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX942GSEL-NEXT: s_endpgm bb: call void @foo() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -1830,59 +831,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3 ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a24 -; 
GFX908-NEXT: v_accvgpr_read_b32 v4, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a24 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a28 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a16 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a20 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a8 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32 ; GFX908-NEXT: 
s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a12 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a4 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16 ; GFX908-NEXT: s_cbranch_scc1 .LBB6_2 @@ -1905,331 +906,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: .LBB6_2: ; %bb3 ; GFX908-NEXT: s_endpgm -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: -; GFX90ADAG: ; %bb.0: ; %bb1 -; GFX90ADAG-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX90ADAG-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX90ADAG-NEXT: s_mov_b32 s54, -1 -; GFX90ADAG-NEXT: s_mov_b32 s55, 0xe00000 -; GFX90ADAG-NEXT: s_add_u32 s52, s52, s11 -; GFX90ADAG-NEXT: s_mov_b32 s14, s10 -; GFX90ADAG-NEXT: s_mov_b32 s12, s8 -; GFX90ADAG-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX90ADAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90ADAG-NEXT: s_load_dword s8, s[4:5], 0x2c -; 
GFX90ADAG-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v3, 2.0 -; GFX90ADAG-NEXT: s_addc_u32 s53, s53, 0 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 -; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40 -; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX90ADAG-NEXT: s_bitcmp0_b32 s8, 0 -; GFX90ADAG-NEXT: s_mov_b32 s32, 0 -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a0, s36 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a1, s37 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a2, s38 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a3, s39 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a4, s40 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a5, s41 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a6, s42 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a7, s43 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a8, s44 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a9, s45 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a10, s46 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a11, s47 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a12, s48 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a13, s49 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a14, s50 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a15, s51 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a16, s16 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a17, s17 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a18, s18 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a19, s19 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a20, s20 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a21, s21 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a22, s22 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a23, s23 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a24, s24 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a25, s25 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a26, s26 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a27, s27 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a28, s28 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a29, s29 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a30, s30 -; GFX90ADAG-NEXT: v_accvgpr_write_b32 a31, s31 -; GFX90ADAG-NEXT: s_nop 1 -; 
GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96 -; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] -; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16 -; GFX90ADAG-NEXT: s_cbranch_scc1 .LBB6_2 -; GFX90ADAG-NEXT: ; %bb.1: ; %bb2 -; GFX90ADAG-NEXT: s_add_u32 s8, s4, 48 -; GFX90ADAG-NEXT: s_mov_b32 s13, s9 -; GFX90ADAG-NEXT: s_addc_u32 s9, s5, 0 -; GFX90ADAG-NEXT: s_getpc_b64 s[4:5] -; GFX90ADAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 -; GFX90ADAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX90ADAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX90ADAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX90ADAG-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90ADAG-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX90ADAG-NEXT: v_mov_b32_e32 v31, v0 -; GFX90ADAG-NEXT: s_mov_b64 s[2:3], s[54:55] -; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX90ADAG-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX90ADAG-NEXT: .LBB6_2: ; %bb3 -; GFX90ADAG-NEXT: s_endpgm -; -; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: -; GFX90AGSEL: ; %bb.0: ; %bb1 -; GFX90AGSEL-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 -; GFX90AGSEL-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 -; GFX90AGSEL-NEXT: s_mov_b32 s70, -1 -; GFX90AGSEL-NEXT: s_mov_b32 s71, 0xe00000 -; GFX90AGSEL-NEXT: s_add_u32 s68, s68, s11 -; GFX90AGSEL-NEXT: s_mov_b32 s14, s10 -; GFX90AGSEL-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX90AGSEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90AGSEL-NEXT: s_mov_b64 s[16:17], s[0:1] -; 
GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90AGSEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 -; GFX90AGSEL-NEXT: s_load_dwordx16 s[52:67], s[0:1], 0x40 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90AGSEL-NEXT: s_addc_u32 s69, s69, 0 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a0, s36 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a16, s52 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a6, s42 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a15, s51 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a17, s53 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a18, s54 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a19, s55 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a20, s56 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a21, s57 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a22, s58 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a23, s59 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a24, s60 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a25, s61 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a26, s62 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a27, s63 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a28, s64 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a29, s65 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a30, s66 -; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a31, s67 -; 
GFX90AGSEL-NEXT: s_xor_b32 s2, s2, 1 -; GFX90AGSEL-NEXT: s_and_b32 s2, s2, 1 -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX90AGSEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX90AGSEL-NEXT: s_mov_b32 s32, 0 -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] -; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[4:7], s[0:1] offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[8:11], s[0:1] offset:32 -; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[12:15], s[0:1] offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[16:19], s[0:1] offset:64 -; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[20:23], s[0:1] offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[24:27], s[0:1] offset:96 -; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[28:31], s[0:1] offset:112 -; GFX90AGSEL-NEXT: s_cbranch_scc1 .LBB6_2 -; GFX90AGSEL-NEXT: ; %bb.1: ; %bb2 -; GFX90AGSEL-NEXT: s_getpc_b64 s[0:1] -; GFX90AGSEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 -; GFX90AGSEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 -; GFX90AGSEL-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0 -; GFX90AGSEL-NEXT: s_mov_b32 s12, s8 -; GFX90AGSEL-NEXT: s_add_u32 s8, s4, 48 -; GFX90AGSEL-NEXT: s_mov_b64 s[0:1], s[68:69] -; GFX90AGSEL-NEXT: s_mov_b32 s13, s9 -; GFX90AGSEL-NEXT: s_addc_u32 s9, s5, 0 -; GFX90AGSEL-NEXT: s_mov_b64 s[2:3], s[70:71] -; GFX90AGSEL-NEXT: s_mov_b64 s[4:5], s[16:17] -; GFX90AGSEL-NEXT: v_mov_b32_e32 v31, v0 -; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90AGSEL-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX90AGSEL-NEXT: .LBB6_2: ; %bb3 -; GFX90AGSEL-NEXT: s_endpgm -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: -; GFX942DAG: ; %bb.0: ; %bb1 -; GFX942DAG-NEXT: s_mov_b32 s14, s10 -; GFX942DAG-NEXT: s_mov_b32 s12, s8 -; GFX942DAG-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX942DAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942DAG-NEXT: s_load_dword s8, s[4:5], 0x2c -; 
GFX942DAG-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v3, 2.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 -; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40 -; GFX942DAG-NEXT: s_bitcmp0_b32 s8, 0 -; GFX942DAG-NEXT: s_mov_b32 s32, 0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: v_accvgpr_write_b32 a0, s36 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a1, s37 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a2, s38 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a3, s39 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a4, s40 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a5, s41 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a6, s42 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a7, s43 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a8, s44 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a9, s45 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a10, s46 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a11, s47 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a12, s48 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a13, s49 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a14, s50 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a15, s51 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a16, s16 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a17, s17 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a18, s18 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a19, s19 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a20, s20 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a21, s21 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a22, s22 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a23, s23 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a24, s24 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a25, s25 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a26, s26 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a27, s27 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a28, s28 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a29, s29 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a30, s30 -; GFX942DAG-NEXT: v_accvgpr_write_b32 a31, s31 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, 
v3, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] -; GFX942DAG-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16 -; GFX942DAG-NEXT: s_cbranch_scc1 .LBB6_2 -; GFX942DAG-NEXT: ; %bb.1: ; %bb2 -; GFX942DAG-NEXT: s_add_u32 s8, s4, 48 -; GFX942DAG-NEXT: s_mov_b32 s13, s9 -; GFX942DAG-NEXT: s_addc_u32 s9, s5, 0 -; GFX942DAG-NEXT: s_getpc_b64 s[4:5] -; GFX942DAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 -; GFX942DAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX942DAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX942DAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX942DAG-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX942DAG-NEXT: v_mov_b32_e32 v31, v0 -; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942DAG-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX942DAG-NEXT: .LBB6_2: ; %bb3 -; GFX942DAG-NEXT: s_endpgm -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: -; GFX942GSEL: ; %bb.0: ; %bb1 -; GFX942GSEL-NEXT: s_mov_b32 s14, s10 -; GFX942GSEL-NEXT: s_mov_b32 s12, s8 -; GFX942GSEL-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX942GSEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942GSEL-NEXT: s_load_dword s8, s[4:5], 0x2c -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x0 -; GFX942GSEL-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x40 -; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942GSEL-NEXT: s_xor_b32 s8, s8, 1 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: 
v_accvgpr_write_b32 a0, s16 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a16, s36 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a17, s37 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a18, s38 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a19, s39 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a20, s40 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a21, s41 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a22, s42 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a23, s43 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a24, s44 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a25, s45 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a26, s46 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a27, s47 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a28, s48 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a29, s49 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a30, s50 -; GFX942GSEL-NEXT: v_accvgpr_write_b32 a31, s51 -; GFX942GSEL-NEXT: s_and_b32 s8, s8, 1 -; GFX942GSEL-NEXT: s_cmp_lg_u32 s8, 0 -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX942GSEL-NEXT: s_mov_b32 s32, 0 -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] -; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[8:11], 
s[6:7] offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96 -; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112 -; GFX942GSEL-NEXT: s_cbranch_scc1 .LBB6_2 -; GFX942GSEL-NEXT: ; %bb.1: ; %bb2 -; GFX942GSEL-NEXT: s_getpc_b64 s[6:7] -; GFX942GSEL-NEXT: s_add_u32 s6, s6, foo@gotpcrel32@lo+4 -; GFX942GSEL-NEXT: s_addc_u32 s7, s7, foo@gotpcrel32@hi+12 -; GFX942GSEL-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 -; GFX942GSEL-NEXT: s_add_u32 s8, s4, 48 -; GFX942GSEL-NEXT: s_mov_b32 s13, s9 -; GFX942GSEL-NEXT: s_addc_u32 s9, s5, 0 -; GFX942GSEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX942GSEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX942GSEL-NEXT: v_mov_b32_e32 v31, v0 -; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942GSEL-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX942GSEL-NEXT: .LBB6_2: ; %bb3 -; GFX942GSEL-NEXT: s_endpgm bb1: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) @@ -2296,40 +972,40 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 -; 
GFX908-NEXT: v_accvgpr_read_b32 v11, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 ; 
GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 @@ -2339,122 +1015,6 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: -; GFX90ADAG: ; %bb.0: ; %bb -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90ADAG-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 -; GFX90ADAG-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 -; GFX90ADAG-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 -; GFX90ADAG-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 -; GFX90ADAG-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 -; GFX90ADAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 -; GFX90ADAG-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX90ADAG-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v35, 2.0 -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) -; GFX90ADAG-NEXT: s_nop 0 -; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33] -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 -; 
GFX90ADAG-NEXT: s_waitcnt vmcnt(0) -; GFX90ADAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: -; GFX90AGSEL: ; %bb.0: ; %bb -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90AGSEL-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX90AGSEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 -; GFX90AGSEL-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v34, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v35, 2.0 -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) -; GFX90AGSEL-NEXT: s_nop 0 -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33] -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: s_nop 2 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112 -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) -; GFX90AGSEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: -; GFX942DAG: ; %bb.0: ; %bb -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942DAG-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 -; GFX942DAG-NEXT: global_load_dwordx4 v[26:29], 
v[0:1], off offset:96 -; GFX942DAG-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 -; GFX942DAG-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 -; GFX942DAG-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 -; GFX942DAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 -; GFX942DAG-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX942DAG-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX942DAG-NEXT: v_mov_b32_e32 v34, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v35, 2.0 -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) -; GFX942DAG-NEXT: s_nop 0 -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v34, v35, v[2:33] -; GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) -; GFX942DAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: -; GFX942GSEL: ; %bb.0: ; %bb -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942GSEL-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX942GSEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX942GSEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 -; GFX942GSEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 -; GFX942GSEL-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 -; GFX942GSEL-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 -; GFX942GSEL-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 -; 
GFX942GSEL-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 -; GFX942GSEL-NEXT: v_mov_b32_e32 v34, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v35, 2.0 -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) -; GFX942GSEL-NEXT: s_nop 0 -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v34, v35, v[2:33] -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: s_nop 1 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112 -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) -; GFX942GSEL-NEXT: s_setpc_b64 s[30:31] bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -2513,40 +1073,40 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v12, 
a18 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 ; GFX908-NEXT: 
global_store_dwordx4 v[0:1], v[10:13], off offset:64 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 @@ -2556,122 +1116,6 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: -; GFX90ADAG: ; %bb.0: ; %bb -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112 -; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96 -; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80 -; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64 -; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48 -; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32 -; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16 -; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v[0:1], off -; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX90ADAG-NEXT: v_mov_b32_e32 v3, 2.0 -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) -; GFX90ADAG-NEXT: s_nop 0 -; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GFX90ADAG-NEXT: s_nop 15 -; GFX90ADAG-NEXT: s_nop 2 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48 -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[0:3], off -; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16 -; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) -; GFX90ADAG-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: -; GFX90AGSEL: ; %bb.0: ; %bb -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v[0:1], off -; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96 -; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX90AGSEL-NEXT: v_mov_b32_e32 v3, 2.0 -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) -; GFX90AGSEL-NEXT: s_nop 0 -; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GFX90AGSEL-NEXT: s_nop 15 -; GFX90AGSEL-NEXT: s_nop 2 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96 -; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112 -; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) -; GFX90AGSEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: -; GFX942DAG: ; %bb.0: ; %bb -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112 -; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96 -; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v[0:1], off 
offset:80 -; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64 -; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48 -; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32 -; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16 -; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v[0:1], off -; GFX942DAG-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942DAG-NEXT: v_mov_b32_e32 v3, 2.0 -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) -; GFX942DAG-NEXT: s_nop 0 -; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31] -; GFX942DAG-NEXT: s_nop 15 -; GFX942DAG-NEXT: s_nop 1 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48 -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[0:3], off -; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16 -; GFX942DAG-NEXT: s_waitcnt vmcnt(0) -; GFX942DAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: -; GFX942GSEL: ; %bb.0: ; %bb -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v[0:1], off -; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16 -; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32 -; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48 -; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64 -; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80 -; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96 -; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112 -; 
GFX942GSEL-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942GSEL-NEXT: v_mov_b32_e32 v3, 2.0 -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) -; GFX942GSEL-NEXT: s_nop 0 -; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31] -; GFX942GSEL-NEXT: s_nop 15 -; GFX942GSEL-NEXT: s_nop 1 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96 -; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112 -; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) -; GFX942GSEL-NEXT: s_setpc_b64 s[30:31] bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -2686,5 +1130,5 @@ attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2 attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" } attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} ; GFX90A: {{.*}} -; GFX942: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index d444db8cd1bdf..0af655dfbbee9 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -54,64 +54,49 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; 
GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: @@ -300,64 +285,49 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; 
GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, 
a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat: @@ -542,69 +512,53 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 
v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; 
GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 
@@ -638,6 +592,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -663,7 +618,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 @@ -697,6 +651,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -819,64 +774,49 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: 
s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: @@ -1079,179 +1019,133 @@ exit: define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: test_mfma_loop_vgpr_init: ; GFX908: ; %bb.0: ; %entry +; GFX908-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX908-NEXT: 
v_accvgpr_write_b32 a27, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; 
GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a9, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a11, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a14, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a15, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a17, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a18, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a21, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a23, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a26, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a27, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a29, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a30, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX908-NEXT: .LBB4_1: 
; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX908-NEXT: s_add_i32 s0, s0, -1 ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; 
GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 
v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_vgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 +; 
GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1276,42 +1170,42 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_vgpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_and_b32_e32 
v2, 0x3ff, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 -; 
GFX942-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1435,105 +1329,91 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: 
v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: 
v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_sgpr_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: 
v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX90A-NEXT: 
v_accvgpr_write_b32 a13, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1560,41 +1440,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 +; 
GFX942-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s1 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1715,72 +1596,60 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; 
GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] 
offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1810,11 +1679,9 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1840,9 +1707,12 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 
a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1872,11 +1742,9 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1967,64 +1835,49 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; 
GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: 
v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_mfma_forward_init: @@ -2187,64 +2040,49 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 12 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; 
GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: 
v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_agpr_init: @@ -2481,64 +2319,49 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; 
%bb.4: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 -; GFX908-NEXT: s_nop 9 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 
v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: s_nop 10 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] 
offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_nested_loop_zeroinit: @@ -2999,8 +2822,8 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: s_mov_b32 s4, 16 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3048,7 +2871,6 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 @@ -3082,6 +2904,7 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3129,7 +2952,6 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 @@ -3163,6 +2985,7 @@ define <32 x float> 
@test_mfma_loop_non_splat_ret_use() #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 800eb9efa571e..51cd564bdece3 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -95,123 +95,123 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a32 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a33 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 +; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33 +; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59 +; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34 -; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57 +; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56 ; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a35 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a36 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35 +; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55 +; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36 +; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53 +; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52 +; 
GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37 -; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51 +; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50 ; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a38 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a39 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49 +; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39 +; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40 -; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19 ; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a41 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a42 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43 -; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13 ; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a44 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a45 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a46 -; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2 -; 
GREEDY908-NEXT: v_accvgpr_write_b32 a15, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a47 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a48 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a49 -; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a50 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a51 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a52 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a53 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a54 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a55 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a56 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a57 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a58 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a59 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a61 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7 ; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, 
v5 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 -; GREEDY908-NEXT: 
v_accvgpr_read_b32 v1, a9 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GREEDY908-NEXT: s_endpgm @@ -499,73 +499,105 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) -; FAST90A-NEXT: v_accvgpr_write_b32 a0, s36 -; FAST90A-NEXT: 
v_accvgpr_write_b32 a1, s37 -; FAST90A-NEXT: v_accvgpr_write_b32 a2, s38 -; FAST90A-NEXT: v_accvgpr_write_b32 a3, s39 -; FAST90A-NEXT: v_accvgpr_write_b32 a4, s40 -; FAST90A-NEXT: v_accvgpr_write_b32 a5, s41 -; FAST90A-NEXT: v_accvgpr_write_b32 a6, s42 -; FAST90A-NEXT: v_accvgpr_write_b32 a7, s43 -; FAST90A-NEXT: v_accvgpr_write_b32 a8, s44 -; FAST90A-NEXT: v_accvgpr_write_b32 a9, s45 -; FAST90A-NEXT: v_accvgpr_write_b32 a10, s46 -; FAST90A-NEXT: v_accvgpr_write_b32 a11, s47 -; FAST90A-NEXT: v_accvgpr_write_b32 a12, s48 -; FAST90A-NEXT: v_accvgpr_write_b32 a13, s49 -; FAST90A-NEXT: v_accvgpr_write_b32 a14, s50 -; FAST90A-NEXT: v_accvgpr_write_b32 a15, s51 -; FAST90A-NEXT: v_accvgpr_write_b32 a16, s4 -; FAST90A-NEXT: v_accvgpr_write_b32 a17, s5 -; FAST90A-NEXT: v_accvgpr_write_b32 a18, s6 -; FAST90A-NEXT: v_accvgpr_write_b32 a19, s7 -; FAST90A-NEXT: v_accvgpr_write_b32 a20, s8 -; FAST90A-NEXT: v_accvgpr_write_b32 a21, s9 -; FAST90A-NEXT: v_accvgpr_write_b32 a22, s10 -; FAST90A-NEXT: v_accvgpr_write_b32 a23, s11 -; FAST90A-NEXT: v_accvgpr_write_b32 a24, s12 -; FAST90A-NEXT: v_accvgpr_write_b32 a25, s13 -; FAST90A-NEXT: v_accvgpr_write_b32 a26, s14 -; FAST90A-NEXT: v_accvgpr_write_b32 a27, s15 -; FAST90A-NEXT: v_accvgpr_write_b32 a28, s16 -; FAST90A-NEXT: v_accvgpr_write_b32 a29, s17 -; FAST90A-NEXT: v_accvgpr_write_b32 a30, s18 -; FAST90A-NEXT: v_accvgpr_write_b32 a31, s19 +; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36 +; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37 +; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38 +; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39 +; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40 +; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41 +; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42 +; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43 +; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44 +; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45 +; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46 +; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47 +; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48 +; 
FAST90A-NEXT: v_accvgpr_write_b32 a45, s49 +; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50 +; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51 +; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4 +; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5 +; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6 +; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7 +; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8 +; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9 +; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10 +; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11 +; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12 +; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13 +; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14 +; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15 +; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16 +; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17 +; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18 +; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[0:31] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] ; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 -; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a32 -; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a33 -; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a34 -; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a35 -; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a36 -; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a37 -; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a38 -; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a39 -; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a40 -; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a41 -; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a42 -; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a43 -; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a44 -; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a45 -; FAST90A-NEXT: v_accvgpr_mov_b32 a16, a46 -; FAST90A-NEXT: v_accvgpr_mov_b32 a17, a47 -; FAST90A-NEXT: v_accvgpr_mov_b32 a18, a48 -; FAST90A-NEXT: v_accvgpr_mov_b32 a19, a49 -; FAST90A-NEXT: 
v_accvgpr_mov_b32 a20, a50 -; FAST90A-NEXT: v_accvgpr_mov_b32 a21, a51 -; FAST90A-NEXT: v_accvgpr_mov_b32 a22, a52 -; FAST90A-NEXT: v_accvgpr_mov_b32 a23, a53 -; FAST90A-NEXT: v_accvgpr_mov_b32 a24, a54 -; FAST90A-NEXT: v_accvgpr_mov_b32 a25, a55 -; FAST90A-NEXT: v_accvgpr_mov_b32 a26, a56 -; FAST90A-NEXT: v_accvgpr_mov_b32 a27, a57 -; FAST90A-NEXT: v_accvgpr_mov_b32 a28, a58 -; FAST90A-NEXT: v_accvgpr_mov_b32 a29, a59 -; FAST90A-NEXT: v_accvgpr_mov_b32 a30, a60 -; FAST90A-NEXT: v_accvgpr_mov_b32 a31, a61 +; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 +; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 +; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27 +; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26 +; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25 +; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24 +; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23 +; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22 +; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21 +; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20 +; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19 +; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18 +; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17 +; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16 +; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15 +; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14 +; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13 +; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12 +; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11 +; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10 +; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9 +; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8 +; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7 +; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6 +; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5 +; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4 +; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3 +; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2 +; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1 +; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0 +; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32 +; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33 +; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32 +; FAST90A-NEXT: 
v_accvgpr_write_b32 a3, v31 +; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30 +; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29 +; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28 +; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27 +; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26 +; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25 +; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24 +; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23 +; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22 +; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21 +; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20 +; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19 +; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18 +; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17 +; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16 +; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15 +; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14 +; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13 +; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12 +; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11 +; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10 +; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9 +; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8 +; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7 +; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6 +; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5 +; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4 +; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; FAST90A-NEXT: s_nop 15 @@ -594,98 +626,82 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908: ; %bb.0: ; %bb ; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY908-NEXT: v_mov_b32_e32 v16, 0 +; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY908-NEXT: v_mov_b32_e32 v17, s0 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 -; GREEDY908-NEXT: 
v_accvgpr_write_b32 a0, v17 -; GREEDY908-NEXT: v_mov_b32_e32 v17, s3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 -; GREEDY908-NEXT: v_mov_b32_e32 v17, s6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v17 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 -; GREEDY908-NEXT: v_mov_b32_e32 v17, s9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v17 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 -; GREEDY908-NEXT: v_mov_b32_e32 v17, s12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v17 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s15 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s14 -; GREEDY908-NEXT: v_mov_b32_e32 v17, s15 -; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v17 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 +; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s9 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 
+; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 +; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5 ; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] -; GREEDY908-NEXT: s_nop 9 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a16 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a17 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 -; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a20 -; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v17 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a21 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a22 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a23 -; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v17 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a24 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a25 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a26 -; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v17 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a27 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a28 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, 
a29 -; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v17 +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY908-NEXT: s_nop 8 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY908-NEXT: s_nop 9 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 -; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a2 -; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a3 -; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a8 -; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a9 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a10 -; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a11 -; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a4 -; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a5 -; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a6 -; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a7 -; GREEDY908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 -; GREEDY908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 -; GREEDY908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GREEDY908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, 
a10 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GREEDY908-NEXT: s_endpgm ; ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: @@ -693,51 +709,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s13 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GREEDY90A-NEXT: 
v_accvgpr_write_b32 a15, s15 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] -; GREEDY90A-NEXT: s_nop 10 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a16 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a17 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a18 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a19 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a20 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a21 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a22 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a23 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a24 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a25 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a26 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a27 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a28 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a29 +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY90A-NEXT: s_nop 9 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 0 -; 
GREEDY90A-NEXT: s_nop 9 -; GREEDY90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GREEDY90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GREEDY90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GREEDY90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GREEDY90A-NEXT: s_nop 10 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 +; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: @@ -745,51 +749,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3 -; GREEDY942-NEXT: v_accvgpr_write_b32 a4, s4 -; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s5 -; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s6 -; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s7 -; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s8 -; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s9 -; GREEDY942-NEXT: v_accvgpr_write_b32 a10, s10 -; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s11 -; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s12 -; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s13 -; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s14 -; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s15 +; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15 +; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14 +; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13 +; 
GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12 +; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11 +; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10 +; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9 +; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8 +; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7 +; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6 +; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5 +; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4 +; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3 +; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[16:31], v0, v1, a[0:15] -; GREEDY942-NEXT: s_nop 9 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a16 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a17 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a18 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a19 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a20 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a21 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a22 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a23 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a24 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a25 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a26 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a27 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a28 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a29 +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] +; GREEDY942-NEXT: s_nop 8 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] -; GREEDY942-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY942-NEXT: s_nop 8 -; GREEDY942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GREEDY942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] 
offset:32 -; GREEDY942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GREEDY942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GREEDY942-NEXT: s_nop 9 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 +; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32: @@ -847,8 +839,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-LABEL: test_mfma_f32_16x16x1f32: ; FAST90A: ; %bb.0: ; %bb ; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0 -; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0 +; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 +; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 +; FAST90A-NEXT: v_mov_b32_e32 v0, 0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) @@ -869,8 +862,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18 ; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] ; FAST90A-NEXT: s_nop 10 ; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 ; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 @@ -887,9 +880,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28 ; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; FAST90A-NEXT: v_mov_b32_e32 v0, 0 -; FAST90A-NEXT: s_nop 9 +; 
FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] +; FAST90A-NEXT: s_nop 10 ; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index c77042d0c96c3..cf244f0b1f884 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,10 +6,10 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: s_mov_b32 s6, 0 +; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -18,33 +18,34 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX942-NEXT: s_or_b32 s4, s3, 1 +; GFX942-NEXT: s_ashr_i32 s5, s3, 31 ; GFX942-NEXT: s_mov_b32 s3, s2 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: s_or_b32 s4, s6, 1 -; GFX942-NEXT: s_ashr_i32 s3, s6, 31 -; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3] -; GFX942-NEXT: s_and_b32 s6, s3, s4 -; GFX942-NEXT: s_nop 5 -; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 +; GFX942-NEXT: s_and_b32 s3, s5, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; 
GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr6 +; GFX942-NEXT: ; implicit-def: $sgpr3 +; GFX942-NEXT: ; implicit-def: $agpr2 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX908-NEXT: s_mov_b32 s2, 0 -; GFX908-NEXT: s_mov_b32 s6, 0 +; GFX908-NEXT: s_mov_b32 s3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -53,28 +54,28 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX908-NEXT: s_or_b32 s4, s3, 1 +; GFX908-NEXT: s_ashr_i32 s5, s3, 31 ; GFX908-NEXT: s_mov_b32 s3, s2 -; GFX908-NEXT: v_mov_b32_e32 v0, s2 -; GFX908-NEXT: v_mov_b32_e32 v1, s3 +; GFX908-NEXT: v_mov_b32_e32 v1, s2 +; GFX908-NEXT: s_nop 2 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX908-NEXT: v_mov_b32_e32 v2, s3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a1 -; GFX908-NEXT: s_or_b32 s4, s6, 1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX908-NEXT: s_ashr_i32 s3, s6, 31 -; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[0:1], v[0:1], a[0:3] -; GFX908-NEXT: s_and_b32 s6, s3, s4 -; GFX908-NEXT: s_nop 8 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: s_and_b32 s3, s5, s4 +; GFX908-NEXT: 
v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3] ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: ; implicit-def: $sgpr6 +; GFX908-NEXT: ; implicit-def: $sgpr3 +; GFX908-NEXT: ; implicit-def: $agpr2 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir index 29f44282f06fc..01506d0af1913 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir @@ -83,12 +83,13 @@ body: | ; COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec - ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.1: ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -101,12 +102,12 @@ body: | ; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = 
S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 - ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]] - ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_]].sub1 - ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_]].sub1 - ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 - ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_MFMA_F32_16X16X16F16_e64_]].sub0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.3: ; COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) @@ -136,12 +137,13 @@ body: | ; GFX908-COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec - ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.1: ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -154,12 +156,12 @@ body: | ; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 - ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]] - ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_]].sub1 - ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_]].sub1 - ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 - ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = COPY 
[[V_MFMA_F32_16X16X16F16_e64_]].sub0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.3: ; GFX908-COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir index 17458fa8b08a7..a9207de317ea1 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir @@ -74,7 +74,7 @@ body: | ; COALESCE-NEXT: successors: %bb.3(0x80000000) ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 - ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec @@ -116,7 +116,7 @@ body: | ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub0 ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 - ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, 
implicit $exec ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 110604a7cd88e..f4a9e7e8f2759 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -521,8 +521,8 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 -; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s14, s22 +; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s15, s21 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll index 4e6b9166b3ed0..fc154604b8700 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll @@ -27,6 +27,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) % ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31] +; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: s_branch .LBB0_4 ; CHECK-NEXT: .LBB0_2: @@ -46,6 +47,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) % ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31] +; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; CHECK-NEXT: .LBB0_4: ; %endif ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:31] diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index ecada6b300aa1..b9e9893ede4e2 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -369,7 +369,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND @@ -378,66 +378,73 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: 
v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[4:7] +; CHECK-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] -; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7] +; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; CHECK-NEXT: v_mov_b32_e32 v5, v4 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 ; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17] ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29] ; CHECK-NEXT: s_nop 5 -; 
CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] -; CHECK-NEXT: global_store_short v[12:13], v17, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v23, v14 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a2 +; CHECK-NEXT: v_mov_b64_e32 v[20:21], 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v15, v22 +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19] +; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v0 +; CHECK-NEXT: global_store_short v[20:21], v23, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v9, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7] +; CHECK-NEXT: global_store_short v[20:21], v15, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 -; CHECK-NEXT: global_store_short v[12:13], v1, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] +; CHECK-NEXT: global_store_short v[20:21], v14, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16 ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: 
global_store_short v[12:13], v14, off +; CHECK-NEXT: global_store_short v[20:21], v14, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] -; CHECK-NEXT: s_nop 6 -; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v8, off +; CHECK-NEXT: global_store_short v[20:21], v12, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: global_store_short v[12:13], v0, off +; CHECK-NEXT: global_store_short v[20:21], v0, off ; CHECK-NEXT: s_endpgm entry: %k0 = call <4 x float> asm sideeffect "; def $0", "=s"() @@ -812,32 +819,32 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg_ ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v18, 4.0 -; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1 -; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2 -; CHECK-NEXT: v_accvgpr_mov_b32 a2, a3 -; CHECK-NEXT: v_accvgpr_mov_b32 a3, a4 -; CHECK-NEXT: v_accvgpr_mov_b32 a4, a5 -; CHECK-NEXT: v_accvgpr_mov_b32 a5, a6 -; CHECK-NEXT: v_accvgpr_mov_b32 a6, a7 -; CHECK-NEXT: v_accvgpr_mov_b32 a7, a8 -; CHECK-NEXT: v_accvgpr_mov_b32 a8, a9 -; CHECK-NEXT: v_accvgpr_mov_b32 a9, a10 -; CHECK-NEXT: v_accvgpr_mov_b32 a10, a11 -; CHECK-NEXT: v_accvgpr_mov_b32 a11, a12 -; CHECK-NEXT: v_accvgpr_mov_b32 a12, a13 -; CHECK-NEXT: v_accvgpr_mov_b32 a13, a14 -; CHECK-NEXT: v_accvgpr_mov_b32 a14, a15 -; CHECK-NEXT: v_accvgpr_mov_b32 a15, a16 +; CHECK-NEXT: v_accvgpr_mov_b32 a17, a16 +; CHECK-NEXT: v_accvgpr_mov_b32 a16, a15 +; CHECK-NEXT: v_accvgpr_mov_b32 a15, a14 +; CHECK-NEXT: v_accvgpr_mov_b32 a14, a13 +; CHECK-NEXT: v_accvgpr_mov_b32 a13, a12 +; CHECK-NEXT: v_accvgpr_mov_b32 a12, a11 +; 
CHECK-NEXT: v_accvgpr_mov_b32 a11, a10 +; CHECK-NEXT: v_accvgpr_mov_b32 a10, a9 +; CHECK-NEXT: v_accvgpr_mov_b32 a9, a8 +; CHECK-NEXT: v_accvgpr_mov_b32 a8, a7 +; CHECK-NEXT: v_accvgpr_mov_b32 a7, a6 +; CHECK-NEXT: v_accvgpr_mov_b32 a6, a5 +; CHECK-NEXT: v_accvgpr_mov_b32 a5, a4 +; CHECK-NEXT: v_accvgpr_mov_b32 a4, a3 +; CHECK-NEXT: v_accvgpr_mov_b32 a3, a2 +; CHECK-NEXT: v_accvgpr_mov_b32 a2, a1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v18, a[0:15] +; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v18, a[2:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] ; CHECK-NEXT: s_endpgm %def = call <32 x float> asm sideeffect "; def $0", "=a"() %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll index 3ee558d6f8a9e..4d864ad15b411 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll @@ -576,9 +576,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a4, a5 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a5 -; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] 
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a4 +; GFX90A-NEXT: v_accvgpr_mov_b32 a0, a5 +; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -590,9 +590,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1) ; GFX940-NEXT: ; def a4, a5 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_accvgpr_read_b32 v3, a4 -; GFX940-NEXT: v_accvgpr_read_b32 v2, a5 -; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX940-NEXT: v_accvgpr_mov_b32 a1, a4 +; GFX940-NEXT: v_accvgpr_mov_b32 a0, a5 +; GFX940-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %asm = call { i32, i32 } asm "; def $0, $1", "={a4},={a5}"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 50cdf11eea2f7..34043cd067b25 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -413,27 +413,25 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; 
GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -674,27 +672,25 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index a6a84c780cb32..f65340470feb1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -413,27 +413,25 @@ define 
void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -674,27 +672,25 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll index 0b20caea9cd95..51dc9a51ec9d0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll @@ -291,31 +291,27 @@ define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; 
GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -478,31 +474,27 @@ define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll index 2ecbf9622a259..7f8f2dbbb09a1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll @@ -291,31 +291,27 @@ define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -478,31 +474,27 @@ define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 
0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index bacec04ab7600..13e3d94c35446 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -413,27 +413,25 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], 
s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -674,27 +672,25 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll index fb71492fb867d..430f64164d24f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -170,15 +170,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +186,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -273,27 +273,27 @@ 
define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -469,29 +469,27 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -563,27 +561,26 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: 
global_store_dwordx3 v2, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -654,29 +651,27 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -701,27 +696,26 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -750,35 +744,32 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -808,33 +799,30 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; 
GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -864,35 +852,32 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -922,35 +907,33 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1262,35 +1245,34 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def 
v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1320,35 +1302,34 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: 
v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1478,29 +1459,27 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v3f32_v2f32__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1527,27 +1506,25 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1576,35 +1553,34 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1880,27 +1856,26 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1930,35 +1905,34 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2039,29 +2013,28 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll index 1ab87d6f19ec4..ef670e963bdb6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -416,27 +416,25 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -961,29 +959,28 @@ define void @v_shuffle_v3f32_v3f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1010,29 +1007,27 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1400,14 +1395,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1417,14 +1411,13 @@ define void 
@v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2015,14 +2008,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2032,14 +2024,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: 
global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2868,29 +2859,28 @@ define void @v_shuffle_v3f32_v3f32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2918,29 +2908,27 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3019,14 +3007,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3036,14 +3023,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3181,29 +3167,27 @@ define void @v_shuffle_v3f32_v3f32__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3626,14 +3610,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3643,14 +3626,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3791,29 +3773,27 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: 
global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll index c5a08f098b4c6..50c69de069986 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll @@ -965,29 +965,26 @@ define void @v_shuffle_v3f32_v4f32__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v3f32_v4f32__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1014,29 +1011,26 @@ define void @v_shuffle_v3f32_v4f32__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: 
global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1247,29 +1241,28 @@ define void @v_shuffle_v3f32_v4f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1296,29 +1289,26 @@ define void @v_shuffle_v3f32_v4f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1345,29 +1335,28 @@ define void @v_shuffle_v3f32_v4f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1849,14 +1838,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1866,14 +1855,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; 
GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2678,14 +2667,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2695,14 +2684,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4675,29 +4664,28 @@ define void @v_shuffle_v3f32_v4f32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4725,29 +4713,26 @@ define void @v_shuffle_v3f32_v4f32__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] 
; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4775,29 +4760,28 @@ define void @v_shuffle_v3f32_v4f32__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4824,27 +4808,26 @@ define void @v_shuffle_v3f32_v4f32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4878,14 +4861,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; 
GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4895,14 +4878,15 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5097,29 +5081,28 @@ define void @v_shuffle_v3f32_v4f32__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
v_shuffle_v3f32_v4f32__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5146,29 +5129,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5692,14 +5675,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5709,14 +5692,15 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5915,29 +5899,28 @@ define void @v_shuffle_v3f32_v4f32__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5964,29 +5947,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6723,29 +6706,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; 
GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7501,29 +7484,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll index 
91790ab5ff97f..ea4fac3b1d2b1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -170,15 +170,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +186,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -273,27 +273,27 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 
v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -469,29 +469,27 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_3_2: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -563,27 +561,26 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] 
%vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -654,29 +651,27 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -701,27 +696,26 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, 
v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -750,35 +744,32 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -808,33 +799,30 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def 
$0", "=v"() @@ -864,35 +852,32 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -922,35 +907,33 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1262,35 +1245,34 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1320,35 +1302,34 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1478,29 +1459,27 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; 
GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1527,27 +1506,25 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1576,35 +1553,34 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1880,27 +1856,26 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: 
v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1930,35 +1905,34 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2039,29 +2013,28 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 
v6, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index db780ced25148..7061c13b28d03 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -416,27 +416,25 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -961,29 +959,28 @@ define 
void @v_shuffle_v3i32_v3i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1010,29 +1007,27 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: 
global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1400,14 +1395,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1417,14 +1411,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; 
GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2015,14 +2008,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2032,14 +2024,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2868,29 +2859,28 @@ define void @v_shuffle_v3i32_v3i32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_3_3: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2918,29 +2908,27 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3019,14 +3007,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3036,14 +3023,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: 
global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3181,29 +3167,27 @@ define void @v_shuffle_v3i32_v3i32__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3626,14 +3610,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3643,14 +3626,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3791,29 +3773,27 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll index 92d6c95c26599..11d1897d0449f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll @@ -965,29 +965,26 @@ define void @v_shuffle_v3i32_v4i32__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 
-; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1014,29 +1011,26 @@ define void @v_shuffle_v3i32_v4i32__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1247,29 +1241,28 @@ define void @v_shuffle_v3i32_v4i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_0_0: ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1296,29 +1289,26 @@ define void @v_shuffle_v3i32_v4i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: 
global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1345,29 +1335,28 @@ define void @v_shuffle_v3i32_v4i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: 
global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1849,14 +1838,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1866,14 +1855,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2678,14 +2667,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2695,14 +2684,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4675,29 +4664,28 @@ define void @v_shuffle_v3i32_v4i32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4725,29 +4713,26 @@ define void @v_shuffle_v3i32_v4i32__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; 
GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4775,29 +4760,28 @@ define void @v_shuffle_v3i32_v4i32__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4824,27 +4808,26 @@ define void 
@v_shuffle_v3i32_v4i32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4878,14 +4861,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, 
v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4895,14 +4878,15 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5097,29 +5081,28 @@ define void @v_shuffle_v3i32_v4i32__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 
v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5146,29 +5129,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5692,14 +5675,14 @@ define void 
@v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5709,14 +5692,15 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5915,29 +5899,28 @@ define void @v_shuffle_v3i32_v4i32__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, 
v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5964,29 +5947,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: 
v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6723,29 +6706,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7501,29 +7484,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll index bbca5039bb02c..a15fc3212f474 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll @@ -291,31 +291,27 @@ define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_u: ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -699,32 +695,28 @@ define void @v_shuffle_v3i64_v2i64__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], 
s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1563,32 +1555,28 @@ define void @v_shuffle_v3i64_v2i64__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2160,32 +2148,28 @@ define void @v_shuffle_v3i64_v2i64__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll index 8757639c501d2..fe132493ce536 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll @@ -291,31 +291,27 @@ define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: 
;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -699,32 +695,28 @@ define void @v_shuffle_v3p0_v2p0__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 
+; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1563,32 +1555,28 @@ define void @v_shuffle_v3p0_v2p0__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def 
$0", "=v"() @@ -2160,32 +2148,28 @@ define void @v_shuffle_v3p0_v2p0__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index 6d294b58ceeec..bd0100a4ffdb5 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -170,15 +170,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +186,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -273,27 +273,27 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -469,29 +469,27 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -563,27 +561,26 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -654,29 +651,27 @@ 
define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -701,27 +696,26 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -750,35 +744,32 @@ define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; 
GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -808,33 +799,30 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -864,35 +852,32 @@ define 
void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -922,35 +907,33 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1262,35 +1245,34 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1320,35 +1302,34 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; 
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1478,29 +1459,27 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; 
GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1527,27 +1506,25 @@ define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1576,35 +1553,34 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1880,27 +1856,26 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: 
v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1930,35 +1905,34 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2039,29 +2013,28 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; 
GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll index 88d43df5938ee..cecd2a0e4b015 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll @@ -416,27 +416,25 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=v"() @@ -961,29 +959,28 @@ define void @v_shuffle_v3p3_v3p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1010,29 +1007,27 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1400,14 +1395,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1417,14 +1411,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2015,14 +2008,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2032,14 +2024,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2868,29 +2859,28 @@ define void @v_shuffle_v3p3_v3p3__4_3_3(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2918,29 +2908,27 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3019,14 +3007,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3036,14 +3023,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: 
v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3181,29 +3167,27 @@ define void @v_shuffle_v3p3_v3p3__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3626,14 +3610,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: 
;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3643,14 +3626,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3791,29 +3773,27 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll index c9f194d873e35..834f03f013ba1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll @@ -965,29 +965,26 @@ define void @v_shuffle_v3p3_v4p3__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1014,29 +1011,26 @@ define void @v_shuffle_v3p3_v4p3__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1247,29 +1241,28 @@ define void 
@v_shuffle_v3p3_v4p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1296,29 +1289,26 @@ define void @v_shuffle_v3p3_v4p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1345,29 +1335,28 @@ define void @v_shuffle_v3p3_v4p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; 
GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1849,14 +1838,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1866,14 +1855,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() @@ -2678,14 +2667,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2695,14 +2684,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4675,29 +4664,28 @@ define void @v_shuffle_v3p3_v4p3__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: 
v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4725,29 +4713,26 @@ define void @v_shuffle_v3p3_v4p3__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 
v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4775,29 +4760,28 @@ define void @v_shuffle_v3p3_v4p3__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4824,27 +4808,26 @@ define void @v_shuffle_v3p3_v4p3__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4878,14 +4861,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4895,14 +4878,15 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5097,29 +5081,28 @@ define void @v_shuffle_v3p3_v4p3__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: 
;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5146,29 +5129,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5692,14 +5675,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5709,14 +5692,15 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5915,29 +5899,28 @@ define void @v_shuffle_v3p3_v4p3__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5964,29 +5947,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] 
; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6723,29 +6706,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; 
GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7501,29 +7484,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index c7092f04a23ed..df148f299a165 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4f32_v2f32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2380,29 +2380,28 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: 
global_store_dwordx4 v2, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index 1224ab2b381c9..d4ee6fa20cad8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,16 +271,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -414,27 +413,27 @@ define void @v_shuffle_v4f32_v3f32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: 
;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -554,16 +553,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -571,17 +569,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; 
GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -612,16 +609,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -629,17 +626,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -722,29 
+719,27 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -771,29 +766,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: 
global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1037,31 +1031,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1089,31 +1083,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; 
GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1189,29 +1180,28 @@ define void @v_shuffle_v4f32_v3f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1335,31 +1325,31 @@ define void @v_shuffle_v4f32_v3f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1385,29 +1375,28 @@ define void @v_shuffle_v4f32_v3f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1499,15 +1488,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1517,15 +1506,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1556,33 +1545,34 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1616,15 +1606,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,15 +1624,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; 
GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1674,17 +1664,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,17 +1682,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1734,17 +1724,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1752,17 +1742,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1853,15 +1843,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1871,15 +1861,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1911,16 +1901,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1928,16 +1918,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2028,17 +2018,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg 
%ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2046,17 +2036,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2088,17 +2078,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2106,17 +2096,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2613,17 +2603,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2631,17 +2620,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2673,17 +2661,16 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2691,17 +2678,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2733,17 +2720,16 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,17 +2737,16 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2849,17 +2834,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] 
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2867,17 +2851,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2909,16 +2893,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2926,16 +2909,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2966,17 +2949,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2984,17 +2965,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART 
-; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3026,17 +3006,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3044,17 +3023,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: 
;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3086,17 +3065,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3104,17 +3082,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; 
GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3146,17 +3124,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3164,17 +3141,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3607,17 +3584,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3625,17 +3602,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3894,16 +3871,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3911,16 +3888,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3952,17 +3929,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3970,17 +3946,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4070,37 +4045,36 @@ define 
void @v_shuffle_v4f32_v3f32__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4384,31 +4358,31 @@ define void 
@v_shuffle_v4f32_v3f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4436,27 +4410,29 @@ define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4487,17 +4463,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 
v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4505,17 +4481,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4604,35 +4580,36 @@ define void @v_shuffle_v4f32_v3f32__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4660,31 +4637,31 @@ define void @v_shuffle_v4f32_v3f32__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: 
v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4712,31 +4689,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 
v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4764,29 +4741,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; 
def $0", "=v"() @@ -4817,17 +4793,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4835,17 +4810,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5401,27 +5376,28 @@ define void @v_shuffle_v4f32_v3f32__5_u_4_4(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5452,17 +5428,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: 
global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5470,17 +5446,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5567,35 +5543,36 @@ define void @v_shuffle_v4f32_v3f32__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 
v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5622,31 +5599,28 @@ define void @v_shuffle_v4f32_v3f32__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5674,31 +5648,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, 
v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5726,29 +5697,27 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5780,17 +5749,16 @@ define void 
@v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5798,17 +5766,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5840,17 +5808,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5858,17 +5826,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5954,31 +5922,27 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6544,31 +6508,30 @@ define void @v_shuffle_v4f32_v3f32__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 
0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6641,29 +6604,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6695,17 +6657,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6713,17 +6674,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; 
GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6869,31 +6830,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: 
v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll index d5bd41397c4f0..edc540edb3ad1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll @@ -963,29 +963,26 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1012,29 +1009,26 @@ 
define void @v_shuffle_v4f32_v4f32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1446,31 +1440,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; 
GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1740,31 +1734,28 @@ define void @v_shuffle_v4f32_v4f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2794,15 +2785,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2812,15 +2802,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: 
v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4149,15 +4138,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4167,15 +4155,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4209,15 +4196,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART 
; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4227,15 +4213,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5463,37 +5448,34 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; 
GFX90A-NEXT: v_mov_b32_e32 v10, v7 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v7 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7190,31 +7172,28 @@ define void @v_shuffle_v4f32_v4f32__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 
v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7292,29 +7271,28 @@ define void @v_shuffle_v4f32_v4f32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; 
GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7581,31 +7559,30 @@ define void @v_shuffle_v4f32_v4f32__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 
v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7683,31 +7660,28 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7789,15 +7763,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: 
;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7807,15 +7780,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8069,31 +8041,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, 
v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8974,31 +8946,28 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX942: ; 
%bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9078,15 +9047,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9096,15 +9064,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; 
GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9138,15 +9105,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9156,15 +9122,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9312,31 +9277,28 @@ define void 
@v_shuffle_v4f32_v4f32__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9365,31 +9327,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: 
v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10357,15 +10319,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], 
s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10375,15 +10337,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v4 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10591,31 +10553,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; 
GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11815,31 +11777,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, 
v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11868,31 +11830,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll index 03503c9dac197..9d3affa6da266 
100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4i32_v2i32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2386,29 +2386,28 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: 
v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index 0222f73fbd193..1a669adf2b635 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; 
GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,16 +271,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -414,27 +413,27 @@ define void @v_shuffle_v4i32_v3i32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -554,16 +553,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -571,17 +569,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -612,16 +609,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -629,17 +626,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = 
call <3 x i32> asm "; def $0", "=v"() @@ -722,29 +719,27 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -771,29 +766,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; 
GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1037,31 +1031,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1089,31 +1083,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1189,29 +1180,28 @@ define void @v_shuffle_v4i32_v3i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1335,31 +1325,31 @@ define void @v_shuffle_v4i32_v3i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1385,29 +1375,28 @@ define void @v_shuffle_v4i32_v3i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1499,15 +1488,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 
v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1517,15 +1506,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1556,33 +1545,34 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: 
; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1616,15 +1606,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,15 +1624,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: 
v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1674,17 +1664,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,17 +1682,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; 
GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1734,17 +1724,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1752,17 +1742,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: 
v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1853,15 +1843,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1871,15 +1861,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, 
v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1911,16 +1901,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1928,16 +1918,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2028,17 +2018,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr 
addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2046,17 +2036,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2088,17 +2078,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2106,17 +2096,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2613,17 +2603,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2631,17 +2620,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2673,17 +2661,16 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2691,17 +2678,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2733,17 +2720,16 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,17 +2737,16 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2849,17 +2834,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 
v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2867,17 +2851,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2909,16 +2893,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], 
s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2926,16 +2909,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2966,17 +2949,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2984,17 +2965,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3026,17 +3006,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3044,17 +3023,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 
0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3086,17 +3065,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3104,17 +3082,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 
v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3146,17 +3124,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3164,17 +3141,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 
v8, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3607,17 +3584,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3625,17 +3602,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; 
GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3894,16 +3871,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3911,16 +3888,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3952,17 +3929,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3970,17 +3946,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4070,37 +4045,36 
@@ define void @v_shuffle_v4i32_v3i32__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4384,31 +4358,31 @@ define void 
@v_shuffle_v4i32_v3i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4436,27 +4410,29 @@ define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4487,17 +4463,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 
v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4505,17 +4481,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4604,35 +4580,36 @@ define void @v_shuffle_v4i32_v3i32__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4660,31 +4637,31 @@ define void @v_shuffle_v4i32_v3i32__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: 
v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4712,31 +4689,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, 
v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4764,29 +4741,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", 
"=v"() @@ -4817,17 +4793,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4835,17 +4810,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5401,27 +5376,28 @@ define void @v_shuffle_v4i32_v3i32__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5452,17 +5428,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: 
global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5470,17 +5446,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5567,35 +5543,36 @@ define void @v_shuffle_v4i32_v3i32__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 
v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5622,31 +5599,28 @@ define void @v_shuffle_v4i32_v3i32__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5674,31 +5648,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 
-; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5726,29 +5697,27 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5780,17 +5749,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr 
addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5798,17 +5766,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5840,17 +5808,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5858,17 +5826,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5954,31 +5922,27 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6544,31 +6508,30 @@ define void @v_shuffle_v4i32_v3i32__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6641,29 +6604,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6695,17 +6657,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6713,17 +6674,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6869,31 +6830,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: 
v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll index ee2f94b90ffa9..983afa566e2c1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll @@ -963,29 +963,26 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1012,29 +1009,26 @@ define void 
@v_shuffle_v4i32_v4i32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1446,31 +1440,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: 
v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1740,31 +1734,28 @@ define void @v_shuffle_v4i32_v4i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2794,15 +2785,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2812,15 +2802,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, 
v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4149,15 +4138,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4167,15 +4155,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4209,15 +4196,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def 
v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4227,15 +4213,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5463,37 +5448,34 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 
v10, v7 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v7 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7190,31 +7172,28 @@ define void @v_shuffle_v4i32_v4i32__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7292,29 +7271,28 @@ define void @v_shuffle_v4i32_v4i32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; 
GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7581,31 +7559,30 @@ define void @v_shuffle_v4i32_v4i32__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: 
v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7683,31 +7660,28 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7789,15 +7763,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7807,15 +7780,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8069,31 +8041,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8974,31 +8946,28 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9078,15 +9047,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9096,15 +9064,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: 
global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9138,15 +9105,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9156,15 +9122,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9312,31 +9277,28 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_5(ptr 
addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9365,31 +9327,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: 
v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10357,15 +10319,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10375,15 +10337,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v4 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10591,31 +10553,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; 
def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11815,31 +11777,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, 
v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11868,31 +11830,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll index 21ec9acf6317d..ac7d9557ce765 100644 --- 
a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll @@ -291,31 +291,27 @@ define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -628,15 +624,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 
v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -646,18 +642,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -756,15 +752,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: 
v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -774,15 +770,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -957,39 +953,33 @@ define void @v_shuffle_v4i64_v2i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1216,18 +1206,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: 
v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1237,18 +1227,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v0 -; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1566,15 +1556,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 
v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1584,18 +1574,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1673,33 +1663,33 @@ define void @v_shuffle_v4i64_v2i64__0_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; 
GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1971,17 +1961,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 ; GFX90A-NEXT: v_mov_b32_e32 v10, v2 ; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: v_mov_b32_e32 v12, v2 -; GFX90A-NEXT: v_mov_b32_e32 v13, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: 
global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1994,17 +1984,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 ; GFX942-NEXT: v_mov_b32_e32 v10, v2 ; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: v_mov_b32_e32 v12, v2 -; GFX942-NEXT: v_mov_b32_e32 v13, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2457,39 +2447,33 @@ define void @v_shuffle_v4i64_v2i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: 
v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2524,15 +2508,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, 
v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2542,15 +2526,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2593,17 +2577,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; @@ -2613,21 +2597,21 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2665,18 +2649,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2686,19 +2670,19 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2733,15 +2717,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,15 +2735,15 @@ define void 
@v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2792,13 +2776,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2808,13 +2792,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3150,33 +3134,33 @@ define void @v_shuffle_v4i64_v2i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] 
offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3390,39 +3374,39 @@ define void @v_shuffle_v4i64_v2i64__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: 
global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll index 615b382aa355a..8dd4a40d00680 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll @@ -1126,15 +1126,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1144,15 +1144,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: 
v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1388,15 +1388,13 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1406,15 +1404,13 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; 
GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3641,33 +3637,33 @@ define void @v_shuffle_v4i64_v3i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: 
global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4791,15 +4787,13 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4809,15 +4803,13 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5177,15 +5169,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v8, v0 ; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5195,15 +5187,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v8, v0 ; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5540,15 +5532,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def 
v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5558,15 +5550,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6263,17 +6255,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 
0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6283,17 +6275,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6978,33 +6970,33 @@ define void @v_shuffle_v4i64_v3i64__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 
v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7352,15 +7344,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; 
GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7370,15 +7362,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll index 32f6e00716e37..ea9ef2f1ac94a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -8328,15 +8328,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; 
GFX90A-NEXT: v_mov_b32_e32 v12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8346,15 +8346,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v0 -; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11254,15 +11254,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] 
offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11272,15 +11272,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll index ee3b303f88471..b30af835a7882 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll @@ -291,31 +291,27 @@ define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: 
global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -628,15 +624,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -646,18 +642,18 @@ define void 
@v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -756,15 +752,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -774,15 +770,15 @@ define void 
@v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -957,39 +953,33 @@ define void @v_shuffle_v4p0_v2p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: ; 
GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1216,18 +1206,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1237,18 +1227,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: 
;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v0 -; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1566,15 +1556,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1584,18 +1574,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1673,33 +1663,33 @@ define void @v_shuffle_v4p0_v2p0__0_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 
0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1971,17 +1961,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 ; GFX90A-NEXT: v_mov_b32_e32 v10, v2 ; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: v_mov_b32_e32 v12, v2 -; GFX90A-NEXT: v_mov_b32_e32 v13, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1994,17 +1984,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 ; GFX942-NEXT: v_mov_b32_e32 v10, v2 ; 
GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: v_mov_b32_e32 v12, v2 -; GFX942-NEXT: v_mov_b32_e32 v13, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2457,39 +2447,33 @@ define void @v_shuffle_v4p0_v2p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 
; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2524,15 +2508,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2542,15 +2526,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: 
v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2593,17 +2577,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2613,21 +2597,21 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 
v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2665,18 +2649,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2686,19 +2670,19 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: 
v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2733,15 +2717,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,15 +2735,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2792,13 +2776,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2808,13 +2792,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3150,33 +3134,33 @@ define void @v_shuffle_v4p0_v2p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3390,39 +3374,39 @@ define void @v_shuffle_v4p0_v2p0__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll index 09e497259766e..e6ac554735eee 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll @@ -1126,15 
+1126,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1144,15 +1144,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1388,15 +1388,13 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr 
addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1406,15 +1404,13 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3641,33 +3637,33 @@ define void @v_shuffle_v4p0_v3p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4791,15 +4787,13 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: 
global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4809,15 +4803,13 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5177,15 +5169,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v8, v0 ; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 
v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5195,15 +5187,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v8, v0 ; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5540,15 +5532,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; @@ -5558,15 +5550,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6263,17 +6255,17 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6283,17 +6275,17 @@ define void 
@v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6978,33 +6970,33 @@ define void @v_shuffle_v4p0_v3p0__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7352,15 +7344,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7370,15 +7362,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: 
v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll index 257af574366a6..ce1c54129f706 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -8328,15 +8328,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8346,15 +8346,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: 
v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v0 -; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11254,15 +11254,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11272,15 +11272,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; 
GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll index 90a1b99dc7c14..3b5690562c38a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4p3_v2p3__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2386,29 +2386,28 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index d13d26f638e0c..8039e126590b9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,16 +271,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -414,27 +413,27 @@ define void @v_shuffle_v4p3_v3p3__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -554,16 +553,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; @@ -571,17 +569,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -612,16 +609,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -629,17 +626,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; 
GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -722,29 +719,27 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: 
v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -771,29 +766,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1037,31 +1031,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1089,31 +1083,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; 
GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1189,29 +1180,28 @@ define void @v_shuffle_v4p3_v3p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1335,31 +1325,31 @@ define void @v_shuffle_v4p3_v3p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: 
v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1385,29 +1375,28 @@ define void @v_shuffle_v4p3_v3p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] 
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1499,15 +1488,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1517,15 +1506,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1556,33 +1545,34 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1616,15 +1606,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; 
GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,15 +1624,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1674,17 +1664,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,17 +1682,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1734,17 +1724,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 
v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1752,17 +1742,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1853,15 +1843,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1871,15 
+1861,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1911,16 +1901,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1928,16 +1918,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: 
; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2028,17 +2018,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2046,17 +2036,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: 
;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2088,17 +2078,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2106,17 +2096,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; 
GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2613,17 +2603,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2631,17 +2620,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 
-; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2673,17 +2661,16 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2691,17 +2678,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: 
s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2733,17 +2720,16 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,17 +2737,16 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2849,17 +2834,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2867,17 +2851,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 
x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2909,16 +2893,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2926,16 +2909,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2966,17 +2949,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; 
def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2984,17 +2965,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3026,17 +3006,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3044,17 +3023,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3086,17 +3065,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3104,17 +3082,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3146,17 +3124,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; 
GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3164,17 +3141,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3607,17 +3584,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 
v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3625,17 +3602,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3894,16 +3871,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3911,16 +3888,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3952,17 +3929,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3970,17 +3946,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: 
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4070,37 +4045,36 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 
; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4384,31 +4358,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: 
;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4436,27 +4410,29 @@ define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4487,17 +4463,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4505,17 +4481,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x 
ptr addrspace(3)> asm "; def $0", "=v"() @@ -4604,35 +4580,36 @@ define void @v_shuffle_v4p3_v3p3__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4660,31 +4637,31 @@ define void @v_shuffle_v4p3_v3p3__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4712,31 +4689,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; 
GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4764,29 +4741,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4817,17 +4793,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4835,17 +4810,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5401,27 +5376,28 @@ define void @v_shuffle_v4p3_v3p3__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5452,17 +5428,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5470,17 +5446,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 
v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5567,35 +5543,36 @@ define void @v_shuffle_v4p3_v3p3__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=v"() @@ -5622,31 +5599,28 @@ define void @v_shuffle_v4p3_v3p3__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5674,31 +5648,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; 
GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5726,29 +5697,27 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, 
v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5780,17 +5749,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5798,17 +5766,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; 
GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5840,17 +5808,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5858,17 +5826,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; 
GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5954,31 +5922,27 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; 
GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6544,31 +6508,30 @@ define void @v_shuffle_v4p3_v3p3__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) 
; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6641,29 +6604,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6695,17 +6657,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; 
GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6713,17 +6674,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6869,31 +6830,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll index 1684b94cfd452..eeab42ae40d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll @@ -963,29 +963,26 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1012,29 +1009,26 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 
-; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1446,31 +1440,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() @@ -1740,31 +1734,28 @@ define void @v_shuffle_v4p3_v4p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2794,15 +2785,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: 
v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2812,15 +2802,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4149,15 +4138,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4167,15 +4155,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4209,15 +4196,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4227,15 +4213,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; 
GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5463,37 +5448,34 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v7 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v7 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: 
v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7190,31 +7172,28 @@ define void @v_shuffle_v4p3_v4p3__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() @@ -7292,29 +7271,28 @@ define void @v_shuffle_v4p3_v4p3__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7581,31 +7559,30 @@ define void @v_shuffle_v4p3_v4p3__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: 
v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7683,31 +7660,28 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], 
s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7789,15 +7763,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7807,15 +7780,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; 
GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8069,31 +8041,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8974,31 +8946,28 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9078,15 +9047,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9096,15 +9064,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9138,15 +9105,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; 
GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9156,15 +9122,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9312,31 +9277,28 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 
v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9365,31 +9327,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] 
+; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10357,15 +10319,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10375,15 +10337,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v4 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() @@ -10591,31 +10553,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11815,31 +11777,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11868,31 +11830,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: 
global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index 96b18593ea655..d2008be4fd32a 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -16,18 +16,19 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s68, -1 -; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_cselect_b32 s5, s9, 0 +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_cselect_b32 s6, s68, 0 -; CHECK-NEXT: s_add_u32 s50, s34, 48 ; CHECK-NEXT: v_mov_b32_e32 v57, s5 ; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: s_add_u32 s50, s34, 48 +; CHECK-NEXT: v_accvgpr_write_b32 
a33, s5 ; CHECK-NEXT: s_addc_u32 s51, s35, 0 -; CHECK-NEXT: v_pk_mov_b32 v[62:63], s[4:5], s[4:5] op_sel:[0,1] +; CHECK-NEXT: v_accvgpr_write_b32 a32, s4 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12 @@ -47,13 +48,13 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s52, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: v_mov_b32_e32 v60, s66 -; CHECK-NEXT: v_mov_b32_e32 v61, s67 -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63] +; CHECK-NEXT: v_mov_b32_e32 v62, s66 +; CHECK-NEXT: v_mov_b32_e32 v63, s67 +; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] -; CHECK-NEXT: flat_load_dwordx2 a[32:33], v[58:59] +; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[58:59] ; CHECK-NEXT: v_mov_b32_e32 v44, 0 ; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -65,7 +66,7 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: flat_store_dwordx2 v[46:47], v[44:45] -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] @@ -74,9 +75,9 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: v_mov_b32_e32 v1, s67 ; CHECK-NEXT: v_mov_b32_e32 v0, s68 ; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42 -; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[58:59], 
v[62:63] ; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 644705e173b52..b045c761436de 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX942-LABEL: v8i8_multi_block: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v1 +; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[6:7], v2, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_4 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[4:5], v2, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v1 +; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_3 ; GFX942-NEXT: ; %bb.2: ; %bb.2 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: global_store_dwordx2 v1, v[6:7], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13] ; GFX942-NEXT: .LBB11_3: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: .LBB11_4: ; %bb.3 ; 
GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15] +; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() From 4dfc7ab072a2be65797d01e4fef7ced42ba96e5d Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 8 Oct 2025 15:59:23 -0700 Subject: [PATCH 4/5] Control with flag Change-Id: I45128d10724a59687edda05a6fcd37302bfe7e6d --- .../Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 11 +- llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll | 499 +++++++++++++++++- 2 files changed, 508 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp index 427922481ecca..b7dbee9c32130 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -27,6 +27,12 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-prepare-agpr-alloc" +static cl::opt InflateToAVClass( + "amdgpu-avgpr-inflation", cl::Hidden, + cl::desc("Whether to inflate register to the avgpr register " + "class -- which is assignable to either vgpr or agpr."), + cl::init(false)); + namespace { class AMDGPUPrepareAGPRAllocImpl { @@ -122,6 +128,9 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { continue; } + if (!InflateToAVClass) + continue; + for (MachineOperand &Op : MI.operands()) { if (!Op.isReg() || !Op.isDef()) continue; @@ -132,7 +141,7 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { const TargetRegisterClass *RC = MRI.getRegClass(DefReg); - if (TRI->isAGPRClass(RC) || TRI->isVGPRClass(RC)) + if (TRI->hasVectorRegisters(RC)) Changed |= MRI.recomputeRegClass(DefReg); } } diff --git a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll index 3a534149121fb..bf4bf25e6b02a 100644 --- a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll +++ 
b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 --amdgpu-avgpr-inflation < %s | FileCheck -check-prefixes=INFLATE %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 < %s | FileCheck -check-prefixes=GCN %s define amdgpu_kernel void @bad_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, i1 %cond) #0 { ; CHECK-LABEL: bad_rp: @@ -119,6 +120,331 @@ define amdgpu_kernel void @bad_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, ; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[4:7] offset:496 ; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[0:3] offset:480 ; CHECK-NEXT: s_endpgm +; INFLATE-LABEL: bad_rp: +; INFLATE: ; %bb.0: +; INFLATE-NEXT: s_load_dword s0, s[4:5], 0x0 +; INFLATE-NEXT: s_load_dword s1, s[4:5], 0x10 +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mov_b32_e32 v0, s0 +; INFLATE-NEXT: ds_read_b128 a[0:3], v0 +; INFLATE-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; INFLATE-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; INFLATE-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; INFLATE-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; INFLATE-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; INFLATE-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; INFLATE-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; INFLATE-NEXT: ds_read_b128 a[32:35], v0 offset:128 +; INFLATE-NEXT: ds_read_b128 a[36:39], v0 offset:144 +; INFLATE-NEXT: ds_read_b128 a[40:43], v0 offset:160 +; INFLATE-NEXT: ds_read_b128 a[44:47], v0 offset:176 +; INFLATE-NEXT: ds_read_b128 a[48:51], v0 offset:192 +; INFLATE-NEXT: ds_read_b128 a[52:55], v0 offset:208 +; 
INFLATE-NEXT: ds_read_b128 a[56:59], v0 offset:224 +; INFLATE-NEXT: ds_read_b128 a[60:63], v0 offset:240 +; INFLATE-NEXT: s_bitcmp1_b32 s1, 0 +; INFLATE-NEXT: s_cselect_b64 s[0:1], -1, 0 +; INFLATE-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; INFLATE-NEXT: .LBB0_1: ; %bb.1 +; INFLATE-NEXT: ; =>This Inner Loop Header: Depth=1 +; INFLATE-NEXT: s_waitcnt lgkmcnt(14) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], a[0:3], a[0:3], 0 +; INFLATE-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[224:239], a[4:7], a[4:7], v[240:255] +; INFLATE-NEXT: s_waitcnt lgkmcnt(13) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], a[8:11], a[8:11], v[224:239] +; INFLATE-NEXT: s_waitcnt lgkmcnt(12) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], a[12:15], a[12:15], v[208:223] +; INFLATE-NEXT: s_waitcnt lgkmcnt(11) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], a[16:19], a[16:19], v[192:207] +; INFLATE-NEXT: s_waitcnt lgkmcnt(10) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], a[20:23], a[20:23], v[176:191] +; INFLATE-NEXT: s_waitcnt lgkmcnt(9) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], a[24:27], a[24:27], v[160:175] +; INFLATE-NEXT: s_waitcnt lgkmcnt(8) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], a[28:31], a[28:31], v[144:159] +; INFLATE-NEXT: s_waitcnt lgkmcnt(7) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], a[32:35], a[32:35], v[128:143] +; INFLATE-NEXT: s_waitcnt lgkmcnt(6) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], a[36:39], a[36:39], v[112:127] +; INFLATE-NEXT: s_waitcnt lgkmcnt(5) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], a[40:43], a[40:43], v[96:111] +; INFLATE-NEXT: s_waitcnt lgkmcnt(4) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], a[44:47], a[44:47], v[80:95] +; INFLATE-NEXT: s_waitcnt lgkmcnt(3) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], a[48:51], a[48:51], v[64:79] +; INFLATE-NEXT: s_waitcnt lgkmcnt(2) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], a[52:55], a[52:55], 
v[48:63] +; INFLATE-NEXT: s_waitcnt lgkmcnt(1) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], a[56:59], a[56:59], v[32:47] +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], a[60:63], a[60:63], v[16:31] +; INFLATE-NEXT: s_cbranch_vccnz .LBB0_1 +; INFLATE-NEXT: ; %bb.2: ; %bb.2 +; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mov_b64_e32 v[168:169], s[0:1] +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[244:247] offset:16 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[240:243] +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[224:227] offset:32 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[228:231] offset:48 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[208:211] offset:64 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[212:215] offset:80 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[192:195] offset:96 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[196:199] offset:112 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[180:183] offset:144 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[176:179] offset:128 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[160:163] offset:160 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[164:167] offset:176 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[148:151] offset:208 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[156:159] offset:240 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[152:155] offset:224 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[144:147] offset:192 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[140:143] offset:272 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[136:139] offset:256 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[132:135] offset:240 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[128:131] offset:224 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[124:127] offset:304 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[120:123] offset:288 +; INFLATE-NEXT: 
flat_store_dwordx4 v[168:169], v[116:119] offset:272 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[112:115] offset:256 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[108:111] offset:336 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[104:107] offset:320 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[100:103] offset:304 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[96:99] offset:288 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[92:95] offset:368 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[88:91] offset:352 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[84:87] offset:336 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[80:83] offset:320 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[76:79] offset:400 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[72:75] offset:384 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[68:71] offset:368 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[64:67] offset:352 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[60:63] offset:432 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[56:59] offset:416 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[52:55] offset:400 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[48:51] offset:384 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[44:47] offset:464 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[40:43] offset:448 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[36:39] offset:432 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[32:35] offset:416 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[28:31] offset:496 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[24:27] offset:480 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[20:23] offset:464 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[16:19] offset:448 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[12:15] offset:528 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[8:11] offset:512 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[4:7] offset:496 +; INFLATE-NEXT: flat_store_dwordx4 
v[168:169], v[0:3] offset:480 +; INFLATE-NEXT: s_endpgm +; +; GCN-LABEL: bad_rp: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s1, s[4:5], 0x10 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: ds_read_b128 a[48:51], v12 +; GCN-NEXT: ds_read_b128 a[52:55], v12 offset:16 +; GCN-NEXT: ds_read_b128 a[56:59], v12 offset:32 +; GCN-NEXT: ds_read_b128 a[60:63], v12 offset:48 +; GCN-NEXT: ds_read_b128 a[64:67], v12 offset:64 +; GCN-NEXT: ds_read_b128 a[68:71], v12 offset:80 +; GCN-NEXT: ds_read_b128 a[72:75], v12 offset:96 +; GCN-NEXT: ds_read_b128 a[76:79], v12 offset:112 +; GCN-NEXT: ds_read_b128 v[0:3], v12 offset:128 +; GCN-NEXT: ds_read_b128 v[4:7], v12 offset:144 +; GCN-NEXT: ds_read_b128 v[8:11], v12 offset:160 +; GCN-NEXT: ds_read_b128 v[32:35], v12 offset:176 +; GCN-NEXT: ds_read_b128 v[36:39], v12 offset:192 +; GCN-NEXT: ds_read_b128 v[40:43], v12 offset:208 +; GCN-NEXT: ds_read_b128 v[44:47], v12 offset:224 +; GCN-NEXT: ds_read_b128 v[12:15], v12 offset:240 +; GCN-NEXT: s_bitcmp1_b32 s1, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; GCN-NEXT: .LBB0_1: ; %bb.1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: s_nop 9 +; GCN-NEXT: v_accvgpr_read_b32 v16, a48 +; GCN-NEXT: v_accvgpr_read_b32 v17, a49 +; GCN-NEXT: v_accvgpr_read_b32 v18, a50 +; GCN-NEXT: v_accvgpr_read_b32 v19, a51 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], v[16:19], v[16:19], 0 +; GCN-NEXT: v_accvgpr_read_b32 v16, a52 +; GCN-NEXT: v_accvgpr_read_b32 v17, a53 +; GCN-NEXT: v_accvgpr_read_b32 v18, a54 +; GCN-NEXT: v_accvgpr_read_b32 v19, a55 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[224:239], v[16:19], v[16:19], v[240:255] +; GCN-NEXT: s_waitcnt lgkmcnt(13) +; GCN-NEXT: v_accvgpr_read_b32 v16, a56 +; GCN-NEXT: v_accvgpr_read_b32 v17, a57 +; GCN-NEXT: 
v_accvgpr_read_b32 v18, a58 +; GCN-NEXT: v_accvgpr_read_b32 v19, a59 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], v[16:19], v[16:19], v[224:239] +; GCN-NEXT: s_waitcnt lgkmcnt(12) +; GCN-NEXT: v_accvgpr_read_b32 v16, a60 +; GCN-NEXT: v_accvgpr_read_b32 v17, a61 +; GCN-NEXT: v_accvgpr_read_b32 v18, a62 +; GCN-NEXT: v_accvgpr_read_b32 v19, a63 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], v[16:19], v[16:19], v[208:223] +; GCN-NEXT: s_waitcnt lgkmcnt(11) +; GCN-NEXT: v_accvgpr_read_b32 v16, a64 +; GCN-NEXT: v_accvgpr_read_b32 v17, a65 +; GCN-NEXT: v_accvgpr_read_b32 v18, a66 +; GCN-NEXT: v_accvgpr_read_b32 v19, a67 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], v[16:19], v[16:19], v[192:207] +; GCN-NEXT: s_waitcnt lgkmcnt(10) +; GCN-NEXT: v_accvgpr_read_b32 v16, a68 +; GCN-NEXT: v_accvgpr_read_b32 v17, a69 +; GCN-NEXT: v_accvgpr_read_b32 v18, a70 +; GCN-NEXT: v_accvgpr_read_b32 v19, a71 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[16:19], v[16:19], v[176:191] +; GCN-NEXT: s_waitcnt lgkmcnt(9) +; GCN-NEXT: v_accvgpr_read_b32 v16, a72 +; GCN-NEXT: v_accvgpr_read_b32 v17, a73 +; GCN-NEXT: v_accvgpr_read_b32 v18, a74 +; GCN-NEXT: v_accvgpr_read_b32 v19, a75 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[16:19], v[16:19], v[160:175] +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_accvgpr_read_b32 v16, a76 +; GCN-NEXT: v_accvgpr_read_b32 v17, a77 +; GCN-NEXT: v_accvgpr_read_b32 v18, a78 +; GCN-NEXT: v_accvgpr_read_b32 v19, a79 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[16:19], v[16:19], v[144:159] +; GCN-NEXT: s_waitcnt lgkmcnt(7) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[0:3], v[0:3], v[128:143] +; GCN-NEXT: s_waitcnt lgkmcnt(6) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[4:7], v[4:7], v[112:127] +; GCN-NEXT: s_waitcnt lgkmcnt(5) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[8:11], v[8:11], v[96:111] +; 
GCN-NEXT: s_waitcnt lgkmcnt(4) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[32:35], v[32:35], v[80:95] +; GCN-NEXT: s_waitcnt lgkmcnt(3) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[36:39], v[36:39], v[64:79] +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[40:43], v[48:63] +; GCN-NEXT: s_nop 9 +; GCN-NEXT: v_accvgpr_write_b32 a32, v48 +; GCN-NEXT: v_accvgpr_write_b32 a33, v49 +; GCN-NEXT: v_accvgpr_write_b32 a34, v50 +; GCN-NEXT: v_accvgpr_write_b32 a35, v51 +; GCN-NEXT: v_accvgpr_write_b32 a36, v52 +; GCN-NEXT: v_accvgpr_write_b32 a37, v53 +; GCN-NEXT: v_accvgpr_write_b32 a38, v54 +; GCN-NEXT: v_accvgpr_write_b32 a39, v55 +; GCN-NEXT: v_accvgpr_write_b32 a40, v56 +; GCN-NEXT: v_accvgpr_write_b32 a41, v57 +; GCN-NEXT: v_accvgpr_write_b32 a42, v58 +; GCN-NEXT: v_accvgpr_write_b32 a43, v59 +; GCN-NEXT: v_accvgpr_write_b32 a44, v60 +; GCN-NEXT: v_accvgpr_write_b32 a45, v61 +; GCN-NEXT: v_accvgpr_write_b32 a46, v62 +; GCN-NEXT: v_accvgpr_write_b32 a47, v63 +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[44:47], v[44:47], v[16:31] +; GCN-NEXT: v_accvgpr_write_b32 a16, v16 +; GCN-NEXT: v_accvgpr_write_b32 a17, v17 +; GCN-NEXT: v_accvgpr_write_b32 a18, v18 +; GCN-NEXT: v_accvgpr_write_b32 a19, v19 +; GCN-NEXT: v_accvgpr_write_b32 a20, v20 +; GCN-NEXT: v_accvgpr_write_b32 a21, v21 +; GCN-NEXT: v_accvgpr_write_b32 a22, v22 +; GCN-NEXT: v_accvgpr_write_b32 a23, v23 +; GCN-NEXT: v_accvgpr_write_b32 a24, v24 +; GCN-NEXT: v_accvgpr_write_b32 a25, v25 +; GCN-NEXT: v_accvgpr_write_b32 a26, v26 +; GCN-NEXT: v_accvgpr_write_b32 a27, v27 +; GCN-NEXT: v_accvgpr_write_b32 a28, v28 +; GCN-NEXT: v_accvgpr_write_b32 a29, v29 +; GCN-NEXT: v_accvgpr_write_b32 a30, v30 +; GCN-NEXT: v_accvgpr_write_b32 a31, v31 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[12:15], v[12:15], v[48:63] +; GCN-NEXT: s_cbranch_vccnz .LBB0_1 +; GCN-NEXT: ; %bb.2: ; %bb.2 +; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GCN-NEXT: v_accvgpr_write_b32 a0, v48 +; GCN-NEXT: v_accvgpr_write_b32 a1, v49 +; GCN-NEXT: v_accvgpr_write_b32 a2, v50 +; GCN-NEXT: v_accvgpr_write_b32 a3, v51 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_accvgpr_write_b32 a4, v52 +; GCN-NEXT: v_accvgpr_write_b32 a5, v53 +; GCN-NEXT: v_accvgpr_write_b32 a6, v54 +; GCN-NEXT: v_accvgpr_write_b32 a7, v55 +; GCN-NEXT: v_accvgpr_write_b32 a8, v56 +; GCN-NEXT: v_accvgpr_write_b32 a9, v57 +; GCN-NEXT: v_accvgpr_write_b32 a10, v58 +; GCN-NEXT: v_accvgpr_write_b32 a11, v59 +; GCN-NEXT: v_accvgpr_write_b32 a12, v60 +; GCN-NEXT: v_accvgpr_write_b32 a13, v61 +; GCN-NEXT: v_accvgpr_write_b32 a14, v62 +; GCN-NEXT: v_accvgpr_write_b32 a15, v63 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[244:247] offset:16 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[240:243] +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[224:227] offset:32 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[228:231] offset:48 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[208:211] offset:64 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[212:215] offset:80 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[192:195] offset:96 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[196:199] offset:112 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[180:183] offset:144 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[176:179] offset:128 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[160:163] offset:160 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[164:167] offset:176 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[148:151] offset:208 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[156:159] offset:240 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[152:155] offset:224 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[144:147] offset:192 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[140:143] offset:272 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[136:139] offset:256 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[132:135] offset:240 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[128:131] offset:224 +; 
GCN-NEXT: flat_store_dwordx4 v[0:1], v[124:127] offset:304 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[120:123] offset:288 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[116:119] offset:272 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[112:115] offset:256 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[108:111] offset:336 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[104:107] offset:320 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[100:103] offset:304 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:288 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[92:95] offset:368 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[88:91] offset:352 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[84:87] offset:336 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:320 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[76:79] offset:400 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[72:75] offset:384 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:368 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:352 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[44:47] offset:432 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[40:43] offset:416 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[36:39] offset:400 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[32:35] offset:384 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[28:31] offset:464 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[24:27] offset:448 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[20:23] offset:432 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[16:19] offset:416 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:496 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:480 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:464 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[0:3] offset:448 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:528 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:512 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:496 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:480 +; GCN-NEXT: s_endpgm %gep1 = getelementptr ptr addrspace(3), ptr 
addrspace(3) %in0, i32 4 %gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8 %gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12 @@ -292,6 +618,177 @@ define amdgpu_kernel void @good_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, ; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336 ; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320 ; CHECK-NEXT: s_endpgm +; INFLATE-LABEL: good_rp: +; INFLATE: ; %bb.0: +; INFLATE-NEXT: s_load_dword s0, s[4:5], 0x10 +; INFLATE-NEXT: s_load_dword s1, s[4:5], 0x0 +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: s_bitcmp1_b32 s0, 0 +; INFLATE-NEXT: v_mov_b32_e32 v0, s1 +; INFLATE-NEXT: ds_read_b128 v[176:179], v0 +; INFLATE-NEXT: ds_read_b128 v[180:183], v0 offset:16 +; INFLATE-NEXT: ds_read_b128 v[184:187], v0 offset:32 +; INFLATE-NEXT: ds_read_b128 v[188:191], v0 offset:48 +; INFLATE-NEXT: ds_read_b128 v[192:195], v0 offset:64 +; INFLATE-NEXT: ds_read_b128 v[196:199], v0 offset:80 +; INFLATE-NEXT: ds_read_b128 v[200:203], v0 offset:96 +; INFLATE-NEXT: ds_read_b128 v[204:207], v0 offset:112 +; INFLATE-NEXT: ds_read_b128 v[208:211], v0 offset:128 +; INFLATE-NEXT: ds_read_b128 v[212:215], v0 offset:144 +; INFLATE-NEXT: ds_read_b128 v[216:219], v0 offset:160 +; INFLATE-NEXT: s_cselect_b64 s[0:1], -1, 0 +; INFLATE-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; INFLATE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; INFLATE-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; INFLATE-NEXT: .LBB1_1: ; %bb.1 +; INFLATE-NEXT: ; =>This Inner Loop Header: Depth=1 +; INFLATE-NEXT: s_waitcnt lgkmcnt(10) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0 +; INFLATE-NEXT: s_and_b64 vcc, exec, s[0:1] +; INFLATE-NEXT: s_waitcnt lgkmcnt(9) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175] +; INFLATE-NEXT: s_waitcnt lgkmcnt(8) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159] +; INFLATE-NEXT: s_waitcnt lgkmcnt(7) +; 
INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143] +; INFLATE-NEXT: s_waitcnt lgkmcnt(6) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127] +; INFLATE-NEXT: s_waitcnt lgkmcnt(5) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111] +; INFLATE-NEXT: s_waitcnt lgkmcnt(4) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95] +; INFLATE-NEXT: s_waitcnt lgkmcnt(3) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79] +; INFLATE-NEXT: s_waitcnt lgkmcnt(2) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63] +; INFLATE-NEXT: s_waitcnt lgkmcnt(1) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47] +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31] +; INFLATE-NEXT: s_cbranch_vccnz .LBB1_1 +; INFLATE-NEXT: ; %bb.2: ; %bb.2 +; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mov_b64_e32 v[88:89], s[0:1] +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[160:163] +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[144:147] offset:32 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], 
v[68:71] offset:208 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320 +; INFLATE-NEXT: s_endpgm +; +; GCN-LABEL: good_rp: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[4:5], 0x10 +; GCN-NEXT: s_load_dword s1, s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s0, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_read_b128 v[176:179], v0 +; GCN-NEXT: ds_read_b128 v[180:183], v0 offset:16 +; GCN-NEXT: ds_read_b128 v[184:187], v0 offset:32 +; GCN-NEXT: ds_read_b128 v[188:191], v0 offset:48 +; GCN-NEXT: ds_read_b128 v[192:195], v0 offset:64 +; GCN-NEXT: ds_read_b128 v[196:199], v0 offset:80 +; GCN-NEXT: ds_read_b128 v[200:203], v0 offset:96 +; GCN-NEXT: ds_read_b128 v[204:207], v0 offset:112 +; GCN-NEXT: ds_read_b128 v[208:211], v0 offset:128 +; GCN-NEXT: 
ds_read_b128 v[212:215], v0 offset:144 +; GCN-NEXT: ds_read_b128 v[216:219], v0 offset:160 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GCN-NEXT: .LBB1_1: ; %bb.1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt lgkmcnt(10) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(9) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175] +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159] +; GCN-NEXT: s_waitcnt lgkmcnt(7) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143] +; GCN-NEXT: s_waitcnt lgkmcnt(6) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127] +; GCN-NEXT: s_waitcnt lgkmcnt(5) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111] +; GCN-NEXT: s_waitcnt lgkmcnt(4) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95] +; GCN-NEXT: s_waitcnt lgkmcnt(3) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79] +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63] +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31] +; GCN-NEXT: s_cbranch_vccnz .LBB1_1 +; GCN-NEXT: ; %bb.2: ; %bb.2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b64_e32 v[88:89], s[0:1] +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[160:163] +; GCN-NEXT: flat_store_dwordx4 
v[88:89], v[144:147] offset:32 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[68:71] offset:208 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320 +; GCN-NEXT: s_endpgm %gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4 %gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8 %gep3 = 
getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12 From 896fb4bd408ce5decbd57511cadbf9b48d754cbc Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 15 Oct 2025 07:21:11 -0700 Subject: [PATCH 5/5] Update llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp Co-authored-by: Jay Foad --- llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp index b7dbee9c32130..1da795dd5d138 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -131,10 +131,7 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { if (!InflateToAVClass) continue; - for (MachineOperand &Op : MI.operands()) { - if (!Op.isReg() || !Op.isDef()) - continue; - + for (MachineOperand &Op : MI.all_defs()) { Register DefReg = Op.getReg(); if (DefReg.isPhysical()) continue;