diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir new file mode 100644 index 0000000000000..cf1546625b80c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir @@ -0,0 +1,313 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=greedy,2 -stop-after=virtregrewriter,2 -o - %s | FileCheck %s + +--- | + + define amdgpu_kernel void @inflate_result_to_agpr__not_mfma() { + ret void + } + + ; FIXME: Need IR for amdgpu-waves-per-eu + define amdgpu_kernel void @av_split_is_phi_def_crash() #0 { + ret void + } + + define amdgpu_kernel void @copy_physreg_to_av_crash() #0 { + ret void + } + + attributes #0 = { "amdgpu-wave-limiter"="true" "amdgpu-waves-per-eu"="8,8" } +... + +# Inflate pattern, except the defining instruction isn't an MFMA. +--- +name: inflate_result_to_agpr__not_mfma +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + ; CHECK-LABEL: name: inflate_result_to_agpr__not_mfma + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 + ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vcc, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit renamable $vgpr2_vgpr3, implicit renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0, implicit-def $agpr0 + renamable $sgpr0 = S_MOV_B32 0 + undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr1 = COPY renamable $sgpr0 + %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1 + renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + %0.sub9:vreg_512_align2 = COPY %0.sub8 + + bb.1: + liveins: $vcc + + undef %4.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) + S_NOP 0, implicit-def %0:vreg_512_align2, implicit %1, implicit %4 + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + ; No VGPRs available for %0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... + +--- +name: av_split_is_phi_def_crash +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + waveLimiter: true + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr68_sgpr69_sgpr70_sgpr71' + stackPtrOffsetReg: '$sgpr32' + occupancy: 8 + wwmReservedRegs: + - '$vgpr31' + sgprForEXECCopy: '$sgpr72_sgpr73' +body: | + ; CHECK-LABEL: name: av_split_is_phi_def_crash + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr2_sgpr3, $sgpr8_sgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr8, $vgpr0_vgpr1:0x0000000000000003 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $vgpr8, $vgpr31, $sgpr8_sgpr9, $vgpr0_vgpr1:0x0000000000000003, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr1 = COPY renamable $vgpr8 + ; CHECK-NEXT: renamable $agpr0_agpr1 = COPY killed renamable $vgpr2_vgpr3 + ; CHECK-NEXT: renamable $vgpr2 = COPY renamable $vgpr1 + ; CHECK-NEXT: renamable $vgpr3 = COPY renamable $vgpr8 + ; CHECK-NEXT: renamable $vgpr6 = COPY renamable $vgpr1 + ; CHECK-NEXT: dead renamable $vgpr7 = COPY renamable $vgpr8 + ; CHECK-NEXT: renamable $vgpr6 = COPY renamable $vgpr1 + ; CHECK-NEXT: renamable $vgpr7 = COPY renamable $vgpr8 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: renamable $vgpr10_vgpr11 = V_ADD_F64_e64 0, undef $sgpr6_sgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr12_vgpr13 = V_ADD_F64_e64 0, undef $sgpr6_sgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr14_vgpr15 = V_ADD_F64_e64 0, undef $sgpr6_sgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr16_vgpr17 = V_ADD_F64_e64 0, undef $sgpr6_sgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr18_vgpr19 = V_ADD_F64_e64 0, undef $sgpr2_sgpr3, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr20_vgpr21 = V_ADD_F64_e64 0, undef $sgpr28_sgpr29, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr22_vgpr23 = V_ADD_F64_e64 0, undef $sgpr24_sgpr25, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr24_vgpr25 = V_ADD_F64_e64 0, undef $sgpr30_sgpr31, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr26_vgpr27 = V_ADD_F64_e64 0, undef $sgpr26_sgpr27, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr29 = COPY renamable $vgpr12 + ; CHECK-NEXT: renamable $vgpr28 = COPY renamable $vgpr12 + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = V_ADD_F64_e64 0, killed $vgpr28_vgpr29, 0, killed $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr28_vgpr29 = COPY killed renamable $agpr0_agpr1 + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = V_ADD_F64_e64 0, killed $vgpr4_vgpr5, 0, killed $vgpr28_vgpr29, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr4 = COPY killed renamable $vgpr11 + ; CHECK-NEXT: dead renamable $vgpr4 = COPY killed renamable $vgpr14 + ; CHECK-NEXT: dead renamable $vgpr4 = COPY killed renamable $vgpr20 + ; CHECK-NEXT: dead renamable $vgpr4 = COPY killed renamable $vgpr24 + ; CHECK-NEXT: dead renamable $vgpr4 = COPY killed renamable $vgpr26 + ; CHECK-NEXT: renamable $vgpr4 = COPY killed renamable $vgpr6 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: dead renamable $vgpr4 = COPY killed renamable $vgpr16 + ; CHECK-NEXT: dead renamable $vgpr4 = COPY killed renamable $vgpr22 + ; CHECK-NEXT: dead renamable $vgpr4 = COPY killed renamable $vgpr18 + ; CHECK-NEXT: renamable $vgpr2 = COPY killed renamable $vgpr1 + ; CHECK-NEXT: dead renamable $vgpr3 = COPY killed renamable $vgpr8 + ; CHECK-NEXT: dead renamable $vgpr1 = COPY renamable $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0, $sgpr8_sgpr9, $sgpr2_sgpr3, $sgpr8_sgpr9 + + %0:vgpr_32 = IMPLICIT_DEF + %1:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + undef %2.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + %1:vreg_64_align2 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + + bb.2: + liveins: $vgpr31, $sgpr8_sgpr9 + + %3:vgpr_32 = COPY %0 + %4:vgpr_32 = COPY %0 + undef %5.sub0:vreg_64_align2 = COPY %3 + %5.sub1:vreg_64_align2 = COPY %4 + undef %6.sub0:vreg_64_align2 = COPY %3 + dead %6.sub1:vreg_64_align2 = COPY %4 + undef %7.sub0:vreg_64_align2 = COPY %3 + %7.sub1:vreg_64_align2 = COPY %4 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + %8:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr6_sgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + %9:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr6_sgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + %10:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr6_sgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + %11:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr6_sgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + %12:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr2_sgpr3, 0, 0, 0, 0, implicit $mode, implicit $exec + %13:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr28_sgpr29, 0, 0, 0, 0, implicit $mode, implicit $exec + %14:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr24_sgpr25, 0, 0, 0, 0, implicit $mode, implicit $exec + %15:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr30_sgpr31, 0, 0, 0, 0, implicit $mode, implicit $exec + %16:vreg_64_align2 = V_ADD_F64_e64 0, undef $sgpr26_sgpr27, 0, 0, 0, 0, implicit $mode, implicit $exec + undef %17.sub1:vreg_64_align2 = COPY %9.sub0 + %17.sub0:vreg_64_align2 = COPY %9.sub0 + %18:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + %19:vreg_64_align2 = V_ADD_F64_e64 0, %17, 0, %18, 0, 0, implicit $mode, implicit $exec + %20:vreg_64_align2 = V_ADD_F64_e64 0, %19, 0, %1, 0, 0, implicit $mode, implicit $exec + dead %21:vgpr_32 = COPY %20.sub0 + dead %22:vgpr_32 = COPY %8.sub1 + dead %23:vgpr_32 = COPY %10.sub0 + dead %24:vgpr_32 = COPY %13.sub0 + dead %25:vgpr_32 = COPY %15.sub0 + dead %26:vgpr_32 = COPY %16.sub0 + %27:vgpr_32 = COPY %7.sub0 + %28:vgpr_32 = COPY %27 + S_NOP 0 + dead %29:vgpr_32 = COPY %11.sub0 + dead %30:vgpr_32 = COPY %14.sub0 + dead %31:vgpr_32 = COPY %12.sub0 + %32:vreg_64_align2 = COPY %5 + undef %33.sub0:vreg_64_align2 = COPY %3 + dead %33.sub1:vreg_64_align2 = COPY %4 + dead undef %34.sub1:vreg_64_align2 = COPY %9.sub1 + dead %2.sub1:vreg_64_align2 = COPY %2.sub0 + + bb.3: + S_ENDPGM 0 + +... + +--- +name: copy_physreg_to_av_crash +tracksRegLiveness: true +registers: + - { id: 0, class: av_32, preferred-register: '$agpr0' } +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: copy_physreg_to_av_crash + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $agpr0 = COPY $vgpr0 + %0:av_32 = COPY $vgpr0 + $agpr0 = COPY %0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir new file mode 100644 index 0000000000000..171df9343b05d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -0,0 +1,360 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=greedy,2 -stop-after=virtregrewriter,2 -verify-regalloc -verify-machineinstrs -o - %s | FileCheck %s + +# There aren't enough VGPRs for %0 to stay in a VGPR. %0 should be +# split and inflated to AV_512. The VGPR version of the instruction +# should be replaced with the AGPR version, and the copies eliminated. + +--- +name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 + ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0, implicit-def $agpr0 + renamable $sgpr0 = S_MOV_B32 0 + undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr1 = COPY renamable $sgpr0 + %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1 + renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + %0.sub9:vreg_512_align2 = COPY %0.sub8 + + bb.1: + liveins: $vcc + + undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) + %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + ; No VGPRs available for %0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... + +# Same, except spam some other uses of the register that we shouldn't +# break the liveness of + +--- +name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_multi_load +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_multi_load + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 + ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 8, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8 = GLOBAL_LOAD_DWORDX3 undef renamable $vgpr0_vgpr1, 16, 0, implicit $exec :: (load (s96), align 16, addrspace 1) + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 28, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR undef renamable $vgpr0, renamable $vgpr6_vgpr7_vgpr8_vgpr9, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0, implicit-def $agpr0 + renamable $sgpr0 = S_MOV_B32 0 + undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr1 = COPY renamable $sgpr0 + %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1 + renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + %0.sub9:vreg_512_align2 = COPY %0.sub8 + + bb.1: + liveins: $vcc + + undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) + %0.sub2_sub3:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 8, 0, implicit $exec :: (load (s64), addrspace 1) + %0.sub4_sub5_sub6:vreg_512_align2 = GLOBAL_LOAD_DWORDX3 undef %3:vreg_64_align2, 16, 0, implicit $exec :: (load (s96), addrspace 1) + %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 28, 0, 0, implicit $mode, implicit $exec + S_NOP 0, implicit %0 + GLOBAL_STORE_DWORDX4_SADDR undef %4:vgpr_32, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + ; No VGPRs available for %0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... + +# Non-mac variant, src2 is other VGPR. We cannot replace this case +# unless we also try to reassign src2 to an AGPR. +--- +name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 + ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vcc, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19:0x00000000FFFFFFFF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0, implicit-def $agpr0 + renamable $sgpr0 = S_MOV_B32 0 + undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr1 = COPY renamable $sgpr0 + %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1 + renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + %0.sub9:vreg_512_align2 = COPY %0.sub8 + + bb.1: + liveins: $vcc + + undef %4.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) + %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %4, 0, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + ; No VGPRs available for %0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... + +# TODO: The two MFMAs are both rewritable as a set, but if we look at +# one at a time the register class constraints of the other block the +# transform. +--- +name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_two_chained_uses +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_two_chained_uses + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 + ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0, implicit-def $agpr0 + renamable $sgpr0 = S_MOV_B32 0 + undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr1 = COPY renamable $sgpr0 + %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1 + renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + %0.sub9:vreg_512_align2 = COPY %0.sub8 + + bb.1: + liveins: $vcc + + undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) + %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec + %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + ; No VGPRs available for %0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av.mir new file mode 100644 index 0000000000000..33554f8525101 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av.mir @@ -0,0 +1,190 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=greedy,2 -stop-after=virtregrewriter,2 -verify-regalloc -verify-machineinstrs -o - %s | FileCheck %s + +# There aren't enough VGPRs for %0 to stay in a VGPR. %0 should be +# split and inflated to AV_512. The VGPR version of the instruction +# can't be replaced with AGPR version since we can't easily replace +# src2's register class + +--- +name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 + ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + ; CHECK-NEXT: renamable $vgpr11 = COPY renamable $vgpr10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000000F0000 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_MUL_F32 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0, implicit-def $agpr0 + renamable $sgpr0 = S_MOV_B32 0 + undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr1 = COPY renamable $sgpr0 + %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1 + renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + %0.sub9:vreg_512_align2 = COPY %0.sub8 + + bb.1: + liveins: $vcc + + undef %0.sub0_sub1:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub8_sub9, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + ; No VGPRs available for %0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... + +# Non-mac version, src2 is other VGPR +--- +name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 + ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $vgpr26 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr18_vgpr19 + ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 + ; CHECK-NEXT: renamable $vgpr16_vgpr17 = COPY killed renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + ; CHECK-NEXT: renamable $vgpr27 = COPY renamable $vgpr26 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000000F000F + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr26_vgpr27, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_MUL_F32 8, $vgpr18_vgpr19, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $vgpr18_vgpr19, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_MUL_F32 8, $vgpr18_vgpr19, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr8_vgpr9 = nofpexcept V_PK_MUL_F32 8, $vgpr18_vgpr19, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_MUL_F32 8, $vgpr18_vgpr19, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 8, $vgpr18_vgpr19, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr14_vgpr15 = nofpexcept V_PK_MUL_F32 8, killed $vgpr18_vgpr19, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0, implicit-def $agpr0 + renamable $sgpr0 = S_MOV_B32 0 + undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr1 = COPY renamable $sgpr0 + %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1 + renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + %0.sub9:vreg_512_align2 = COPY %0.sub8 + + bb.1: + liveins: $vcc + + undef %3.sub0_sub1:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub8_sub9, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3.sub2_sub3:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub0_sub1, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3.sub4_sub5:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub0_sub1, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3.sub6_sub7:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub0_sub1, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3.sub8_sub9:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub0_sub1, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3.sub10_sub11:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub0_sub1, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3.sub12_sub13:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub0_sub1, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3.sub14_sub15:vreg_512_align2 = nofpexcept V_PK_MUL_F32 8, %0.sub0_sub1, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %3, 0, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + ; No VGPRs available for %0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 + S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 + S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 + S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll new file mode 100644 index 0000000000000..0c6339e4f5121 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mcpu=gfx90a < %s | FileCheck %s + +target triple = "amdgcn-amd-amdhsa" + +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrspace(1) %arg) #0 { +; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a4, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a5, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a6, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a7, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a8, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a9, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a10, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a11, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a12, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a13, v13 +; CHECK-NEXT: v_accvgpr_write_b32 a14, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a15, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a16, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a17, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a19, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a20, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a21, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a22, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a23, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a24, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a25, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a26, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a27, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a28, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a29, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a30, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a31, v31 +; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2.0 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] +; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a59 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a58 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a57 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a56 +; CHECK-NEXT: v_accvgpr_read_b32 v8, a55 +; CHECK-NEXT: v_accvgpr_read_b32 v9, a54 +; CHECK-NEXT: v_accvgpr_read_b32 v10, a53 +; CHECK-NEXT: v_accvgpr_read_b32 v11, a52 +; CHECK-NEXT: v_accvgpr_read_b32 v12, a51 +; CHECK-NEXT: v_accvgpr_read_b32 v13, a50 +; CHECK-NEXT: v_accvgpr_read_b32 v14, a49 +; CHECK-NEXT: v_accvgpr_read_b32 v15, a48 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a47 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a46 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a45 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a44 +; CHECK-NEXT: v_accvgpr_read_b32 v20, a43 +; CHECK-NEXT: v_accvgpr_read_b32 v21, a42 +; CHECK-NEXT: v_accvgpr_read_b32 v22, a41 +; CHECK-NEXT: v_accvgpr_read_b32 v23, a40 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a39 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a38 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a37 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a36 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a35 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a34 +; CHECK-NEXT: v_accvgpr_mov_b32 a2, a32 +; CHECK-NEXT: v_accvgpr_mov_b32 a3, a33 +; CHECK-NEXT: v_accvgpr_write_b32 a4, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a5, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a6, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a7, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a8, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a9, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a10, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a11, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a12, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a13, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a14, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a15, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a16, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a17, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a19, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a20, v13 +; CHECK-NEXT: v_accvgpr_write_b32 a21, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a22, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a23, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a24, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a25, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a26, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a27, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a28, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a29, v4 +; CHECK-NEXT: v_accvgpr_mov_b32 a30, a60 +; CHECK-NEXT: v_accvgpr_mov_b32 a31, a61 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; CHECK-NEXT: s_endpgm +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id + %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128 + %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> + %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128 + ret void +} + +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(ptr addrspace(1) %arg) #0 { +; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2.0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; CHECK-NEXT: s_endpgm +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id + %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128 + %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0) + store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128 + ret void +} + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #1 +declare noundef i32 @llvm.amdgcn.workitem.id.x() #2 + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,4" } +attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }