# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s
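#
# The V_WMMA_F32_16X16X16_F16 instruction is convergent, so MachineSink should
# keep it in the entry block rather than sinking it into bb.1 past the
# divergent SI_IF branch, even though its only uses are in bb.1.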

--- |
  ; ModuleID = 'test-wmma-convergent'
  target triple = "amdgcn-amd-amdhsa"

  define void @wmma_test(ptr addrspace(4) %a, ptr addrspace(4) %b, ptr addrspace(4) %c, float %scale) {
  entry:
    br label %if.then

  if.then:
    br label %if.end

  if.end:
    ret void
  }

...
---
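# Input MIR: the convergent WMMA result (%3) is defined in bb.0 and consumed
# only by the V_FMA_MIX* chain in bb.1.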
name: wmma_test
alignment: 1
tracksRegLiveness: true
body: |
  ; CHECK-LABEL: name: wmma_test
  ; CHECK: bb.0.entry:
  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
  ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_]], [[COPY1]], implicit $exec
  ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
  ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub3
  ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY6]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub2
  ; CHECK-NEXT: [[V_PK_ADD_F16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY7]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub1
  ; CHECK-NEXT: [[V_PK_ADD_F16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY8]], 8, [[COPY8]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0
  ; CHECK-NEXT: [[V_PK_ADD_F16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY9]], 8, [[COPY9]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_F16_3]], %subreg.sub0, [[V_PK_ADD_F16_2]], %subreg.sub1, [[V_PK_ADD_F16_1]], %subreg.sub2, [[V_PK_ADD_F16_]], %subreg.sub3
  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub3
  ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 939538432
  ; CHECK-NEXT: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY10]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub2
  ; CHECK-NEXT: [[V_PK_MUL_F16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY11]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub1
  ; CHECK-NEXT: [[V_PK_MUL_F16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY12]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub0
  ; CHECK-NEXT: [[V_PK_MUL_F16_3:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY13]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_MUL_F16_3]], %subreg.sub0, [[V_PK_MUL_F16_2]], %subreg.sub1, [[V_PK_MUL_F16_1]], %subreg.sub2, [[V_PK_MUL_F16_]], %subreg.sub3
  ; CHECK-NEXT: early-clobber %42:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[REG_SEQUENCE2]], 8, [[REG_SEQUENCE3]], 8, 0, 0, 0, implicit $exec
  ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1
  ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[S_MOV_B32_2]], implicit $exec
  ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[V_AND_B32_e64_]], [[S_MOV_B32_3]], implicit $exec
  ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
  ; CHECK-NEXT: S_BRANCH %bb.1
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.1.if.then:
  ; CHECK-NEXT: successors: %bb.2(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
  ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
  ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
  ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_192 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3, [[COPY15]], %subreg.sub4, [[COPY14]], %subreg.sub5
  ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 3
  ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_4]], [[COPY1]], implicit $exec
  ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B32_e64_1]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
  ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE4]].sub5
  ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE4]].sub4
  ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY17]], %subreg.sub0, [[COPY16]], %subreg.sub1
  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
  ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub0
  ; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY18]], implicit $exec
  ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY %42.sub1
  ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY %42.sub3
  ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY %42.sub5
  ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY %42.sub7
  ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY %42.sub6
  ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY %42.sub4
  ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY %42.sub2
  ; CHECK-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY %42.sub0
  ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; CHECK-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
  ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY26]], 0, 0, 0, [[COPY27]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; CHECK-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
  ; CHECK-NEXT: [[V_FMA_MIXLO_F16_1:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY25]], 0, 0, 0, [[COPY28]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; CHECK-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
  ; CHECK-NEXT: [[V_FMA_MIXLO_F16_2:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY24]], 0, 0, 0, [[COPY29]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; CHECK-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
  ; CHECK-NEXT: [[V_FMA_MIXLO_F16_3:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY23]], 0, 0, 0, [[COPY30]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[V_FMA_MIXHI_F16_:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY22]], 0, 0, 0, [[V_FMA_MIXLO_F16_3]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[V_FMA_MIXHI_F16_1:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY21]], 0, 0, 0, [[V_FMA_MIXLO_F16_2]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[V_FMA_MIXHI_F16_2:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY20]], 0, 0, 0, [[V_FMA_MIXLO_F16_1]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[V_FMA_MIXHI_F16_3:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY19]], 0, 0, 0, [[V_FMA_MIXLO_F16_]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_FMA_MIXHI_F16_3]], %subreg.sub0, [[V_FMA_MIXHI_F16_2]], %subreg.sub1, [[V_FMA_MIXHI_F16_1]], %subreg.sub2, [[V_FMA_MIXHI_F16_]], %subreg.sub3
  ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE7]]
  ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_LSHLREV_B32_e64_2]], [[COPY31]], [[REG_SEQUENCE6]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2.if.end:
  ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
  ; CHECK-NEXT: S_ENDPGM 0
  bb.0.entry:
    successors: %bb.1(0x40000000), %bb.2(0x40000000)
    liveins: $vgpr0, $sgpr0_sgpr1

    %6:sgpr_64 = COPY $sgpr0_sgpr1
    %5:vgpr_32 = COPY $vgpr0
    %7:sgpr_128 = S_LOAD_DWORDX4_IMM %6:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
    %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %6:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
    %9:sreg_32 = COPY %8.sub1:sreg_64_xexec
    %10:sreg_32 = COPY %8.sub0:sreg_64_xexec
    %11:sreg_32 = COPY %7.sub3:sgpr_128
    %12:sreg_32 = COPY %7.sub2:sgpr_128
    %13:sreg_32 = COPY %7.sub1:sgpr_128
    %14:sreg_32 = COPY %7.sub0:sgpr_128
    %15:sgpr_192 = REG_SEQUENCE %14:sreg_32, %subreg.sub0, %13:sreg_32, %subreg.sub1, %12:sreg_32, %subreg.sub2, %11:sreg_32, %subreg.sub3, %10:sreg_32, %subreg.sub4, %9:sreg_32, %subreg.sub5
    %1:sgpr_192 = COPY %15:sgpr_192
    %16:sreg_64_xexec_xnull = REG_SEQUENCE %14:sreg_32, %subreg.sub0, %13:sreg_32, %subreg.sub1
    %17:sreg_64_xexec_xnull = REG_SEQUENCE %12:sreg_32, %subreg.sub0, %11:sreg_32, %subreg.sub1
    %18:sreg_32 = S_MOV_B32 3
    %19:vgpr_32 = V_LSHLREV_B32_e64 %18:sreg_32, %5:vgpr_32, implicit $exec
    %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %100:vreg_64 = REG_SEQUENCE %19:vgpr_32, %subreg.sub0, %101:vgpr_32, %subreg.sub1
    %2:vreg_64 = COPY %100:vreg_64
    %22:sreg_32 = S_MOV_B32 4
    %23:vgpr_32 = V_LSHLREV_B32_e64 %22:sreg_32, %5:vgpr_32, implicit $exec
    %24:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %16:sreg_64_xexec_xnull, %23:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 1)
    %25:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %17:sreg_64_xexec_xnull, %23:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 1)
    %26:vgpr_32 = COPY %24.sub3:vreg_128
    %27:vgpr_32 = V_PK_ADD_F16 8, %26:vgpr_32, 8, %26:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %28:vgpr_32 = COPY %24.sub2:vreg_128
    %29:vgpr_32 = V_PK_ADD_F16 8, %28:vgpr_32, 8, %28:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %30:vgpr_32 = COPY %24.sub1:vreg_128
    %31:vgpr_32 = V_PK_ADD_F16 8, %30:vgpr_32, 8, %30:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %32:vgpr_32 = COPY %24.sub0:vreg_128
    %33:vgpr_32 = V_PK_ADD_F16 8, %32:vgpr_32, 8, %32:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %99:vreg_128 = REG_SEQUENCE %33:vgpr_32, %subreg.sub0, %31:vgpr_32, %subreg.sub1, %29:vgpr_32, %subreg.sub2, %27:vgpr_32, %subreg.sub3
    %35:vgpr_32 = COPY %25.sub3:vreg_128
    %36:sreg_32 = S_MOV_B32 939538432
    %37:vgpr_32 = V_PK_MUL_F16 8, %35:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %38:vgpr_32 = COPY %25.sub2:vreg_128
    %39:vgpr_32 = V_PK_MUL_F16 8, %38:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %40:vgpr_32 = COPY %25.sub1:vreg_128
    %41:vgpr_32 = V_PK_MUL_F16 8, %40:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %42:vgpr_32 = COPY %25.sub0:vreg_128
    %43:vgpr_32 = V_PK_MUL_F16 8, %42:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %98:vreg_128 = REG_SEQUENCE %43:vgpr_32, %subreg.sub0, %41:vgpr_32, %subreg.sub1, %39:vgpr_32, %subreg.sub2, %37:vgpr_32, %subreg.sub3
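    ; %3 below is convergent and only used in bb.1; MachineSink must not move it out of this block.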
    early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %99:vreg_128, 8, %98:vreg_128, 8, 0, 0, 0, implicit $exec
    %47:sreg_32 = S_MOV_B32 1
    %48:vgpr_32 = V_AND_B32_e64 %5:vgpr_32, %47:sreg_32, implicit $exec
    %49:sreg_32 = S_MOV_B32 0
    %50:sreg_32 = V_CMP_EQ_U32_e64 %48:vgpr_32, %49:sreg_32, implicit $exec
    %4:sreg_32 = SI_IF %50:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
    S_BRANCH %bb.1

  bb.1.if.then:
    successors: %bb.2(0x80000000)

    %51:sreg_32 = COPY %1.sub5:sgpr_192
    %52:sreg_32 = COPY %1.sub4:sgpr_192
    %53:sreg_64_xexec_xnull = REG_SEQUENCE %52:sreg_32, %subreg.sub0, %51:sreg_32, %subreg.sub1
    %54:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %6:sgpr_64, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
    %55:vgpr_32 = COPY %2.sub0:vreg_64
    %57:vgpr_32 = V_LSHLREV_B32_e64 %47:sreg_32, %55:vgpr_32, implicit $exec
    %58:vgpr_32 = COPY %3.sub1:vreg_256
    %59:vgpr_32 = COPY %3.sub3:vreg_256
    %60:vgpr_32 = COPY %3.sub5:vreg_256
    %61:vgpr_32 = COPY %3.sub7:vreg_256
    %62:vgpr_32 = COPY %3.sub6:vreg_256
    %63:vgpr_32 = COPY %3.sub4:vreg_256
    %64:vgpr_32 = COPY %3.sub2:vreg_256
    %65:vgpr_32 = COPY %3.sub0:vreg_256
    %67:sreg_32 = IMPLICIT_DEF
    %68:vgpr_32 = COPY %67:sreg_32
    %66:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %65:vgpr_32, 0, 0, 0, %68:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %70:sreg_32 = IMPLICIT_DEF
    %71:vgpr_32 = COPY %70:sreg_32
    %69:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %64:vgpr_32, 0, 0, 0, %71:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %73:sreg_32 = IMPLICIT_DEF
    %74:vgpr_32 = COPY %73:sreg_32
    %72:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %63:vgpr_32, 0, 0, 0, %74:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %76:sreg_32 = IMPLICIT_DEF
    %77:vgpr_32 = COPY %76:sreg_32
    %75:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %62:vgpr_32, 0, 0, 0, %77:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %78:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %61:vgpr_32, 0, 0, 0, %75:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %79:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %60:vgpr_32, 0, 0, 0, %72:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %80:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %59:vgpr_32, 0, 0, 0, %69:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %81:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %58:vgpr_32, 0, 0, 0, %66:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %97:vreg_128 = REG_SEQUENCE %81:vgpr_32, %subreg.sub0, %80:vgpr_32, %subreg.sub1, %79:vgpr_32, %subreg.sub2, %78:vgpr_32, %subreg.sub3
    %83:vreg_128 = COPY %97:vreg_128
    GLOBAL_STORE_DWORDX4_SADDR %57:vgpr_32, %83:vreg_128, %53:sreg_64_xexec_xnull, 0, 0, implicit $exec :: (store (s128), addrspace 1)

  bb.2.if.end:

    SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
    S_ENDPGM 0

...