Commit 14454de

[AMDGPU] WMMA convergent flag fix
1 parent 24c22b7 commit 14454de

File tree

2 files changed: +220 -4 lines


llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 6 additions & 4 deletions
@@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
   defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
   defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
 
-  let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+  let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in {
     let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
     def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
       let PseudoInstr = Instr#PseudoInstrSuffix;
@@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P
     let mayRaiseFPException = 0;
     let ReadsModeReg = 0;
     let AsmMatchConverter = "cvtSWMMAC";
-
+    let isConvergent = 1;
     let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
   }
 }
@@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16
 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
 } // End is_wmma_xdl = 1.
 
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+let isConvergent = 1 in {
+defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
+defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+}
 } // End SubtargetPredicate = isGFX125xOnly
 } // End WaveSizePredicate = isWave32
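The .td change above marks the GFX12 WMMA and SWMMAC pseudo-instructions, and the paired V_WMMA_LD_SCALE pseudos, as convergent, which tells machine passes not to give them new control dependences, for example by sinking them into divergent control flow. The new MIR test below runs machine-sink and checks that the V_WMMA_F32_16X16X16_F16 stays in bb.0.entry even though its only use is the store in bb.1.if.then. As a rough illustration of how the flag is consumed (a minimal sketch, not code from this commit; the helper name is hypothetical), a pass can query it through MachineInstr::isConvergent():

// Minimal sketch (not part of this commit): a conservative legality check a
// sinking-style pass could apply before moving MI into a block with different
// control dependences. The helper name is hypothetical.
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

static bool mayMoveAcrossControlFlow(const MachineInstr &MI) {
  // Convergent instructions (now including the GFX12 WMMA, SWMMAC and
  // V_WMMA_LD_SCALE pseudos) depend on the set of active lanes, so giving
  // them extra control dependences would change their result.
  if (MI.isConvergent())
    return false;
  // Other legality checks (memory effects, side effects, ...) would go here.
  return true;
}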

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s

--- |
  ; ModuleID = 'test-wmma-convergent'
  target triple = "amdgcn-amd-amdhsa"

  define void @wmma_test(ptr addrspace(4) %a, ptr addrspace(4) %b, ptr addrspace(4) %c, float %scale) {
  entry:
    br label %if.then

  if.then:
    br label %if.end

  if.end:
    ret void
  }

...
---
name: wmma_test
alignment: 1
tracksRegLiveness: true
body: |
  ; CHECK-LABEL: name: wmma_test
  ; CHECK: bb.0.entry:
  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
  ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_]], [[COPY1]], implicit $exec
  ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
  ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub3
  ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY6]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub2
  ; CHECK-NEXT: [[V_PK_ADD_F16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY7]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub1
  ; CHECK-NEXT: [[V_PK_ADD_F16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY8]], 8, [[COPY8]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0
  ; CHECK-NEXT: [[V_PK_ADD_F16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_F16 8, [[COPY9]], 8, [[COPY9]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_F16_3]], %subreg.sub0, [[V_PK_ADD_F16_2]], %subreg.sub1, [[V_PK_ADD_F16_1]], %subreg.sub2, [[V_PK_ADD_F16_]], %subreg.sub3
  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub3
  ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 939538432
  ; CHECK-NEXT: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY10]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub2
  ; CHECK-NEXT: [[V_PK_MUL_F16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY11]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub1
  ; CHECK-NEXT: [[V_PK_MUL_F16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY12]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR1]].sub0
  ; CHECK-NEXT: [[V_PK_MUL_F16_3:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY13]], 8, [[S_MOV_B32_1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_MUL_F16_3]], %subreg.sub0, [[V_PK_MUL_F16_2]], %subreg.sub1, [[V_PK_MUL_F16_1]], %subreg.sub2, [[V_PK_MUL_F16_]], %subreg.sub3
  ; CHECK-NEXT: early-clobber %42:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[REG_SEQUENCE2]], 8, [[REG_SEQUENCE3]], 8, 0, 0, 0, implicit $exec
  ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1
  ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[S_MOV_B32_2]], implicit $exec
  ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[V_AND_B32_e64_]], [[S_MOV_B32_3]], implicit $exec
  ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
  ; CHECK-NEXT: S_BRANCH %bb.1
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.1.if.then:
  ; CHECK-NEXT: successors: %bb.2(0x80000000)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
  ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
  ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
  ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_192 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3, [[COPY15]], %subreg.sub4, [[COPY14]], %subreg.sub5
  ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 3
  ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_4]], [[COPY1]], implicit $exec
  ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B32_e64_1]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
  ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE4]].sub5
  ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE4]].sub4
  ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY17]], %subreg.sub0, [[COPY16]], %subreg.sub1
  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
  ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub0
  ; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY18]], implicit $exec
  ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY %42.sub1
  ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY %42.sub3
  ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY %42.sub5
  ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY %42.sub7
  ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY %42.sub6
  ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY %42.sub4
  ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY %42.sub2
  ; CHECK-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY %42.sub0
  ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; CHECK-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
  ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY26]], 0, 0, 0, [[COPY27]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; CHECK-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
  ; CHECK-NEXT: [[V_FMA_MIXLO_F16_1:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY25]], 0, 0, 0, [[COPY28]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; CHECK-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
  ; CHECK-NEXT: [[V_FMA_MIXLO_F16_2:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY24]], 0, 0, 0, [[COPY29]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; CHECK-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
  ; CHECK-NEXT: [[V_FMA_MIXLO_F16_3:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY23]], 0, 0, 0, [[COPY30]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[V_FMA_MIXHI_F16_:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY22]], 0, 0, 0, [[V_FMA_MIXLO_F16_3]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[V_FMA_MIXHI_F16_1:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY21]], 0, 0, 0, [[V_FMA_MIXLO_F16_2]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[V_FMA_MIXHI_F16_2:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY20]], 0, 0, 0, [[V_FMA_MIXLO_F16_1]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[V_FMA_MIXHI_F16_3:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16 0, [[S_LOAD_DWORD_IMM]], 0, [[COPY19]], 0, 0, 0, [[V_FMA_MIXLO_F16_]], 0, 0, implicit $mode, implicit $exec
  ; CHECK-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_FMA_MIXHI_F16_3]], %subreg.sub0, [[V_FMA_MIXHI_F16_2]], %subreg.sub1, [[V_FMA_MIXHI_F16_1]], %subreg.sub2, [[V_FMA_MIXHI_F16_]], %subreg.sub3
  ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE7]]
  ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_LSHLREV_B32_e64_2]], [[COPY31]], [[REG_SEQUENCE6]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2.if.end:
  ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
  ; CHECK-NEXT: S_ENDPGM 0
  bb.0.entry:
    successors: %bb.1(0x40000000), %bb.2(0x40000000)
    liveins: $vgpr0, $sgpr0_sgpr1

    %6:sgpr_64 = COPY $sgpr0_sgpr1
    %5:vgpr_32 = COPY $vgpr0
    %7:sgpr_128 = S_LOAD_DWORDX4_IMM %6:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
    %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %6:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
    %9:sreg_32 = COPY %8.sub1:sreg_64_xexec
    %10:sreg_32 = COPY %8.sub0:sreg_64_xexec
    %11:sreg_32 = COPY %7.sub3:sgpr_128
    %12:sreg_32 = COPY %7.sub2:sgpr_128
    %13:sreg_32 = COPY %7.sub1:sgpr_128
    %14:sreg_32 = COPY %7.sub0:sgpr_128
    %15:sgpr_192 = REG_SEQUENCE %14:sreg_32, %subreg.sub0, %13:sreg_32, %subreg.sub1, %12:sreg_32, %subreg.sub2, %11:sreg_32, %subreg.sub3, %10:sreg_32, %subreg.sub4, %9:sreg_32, %subreg.sub5
    %1:sgpr_192 = COPY %15:sgpr_192
    %16:sreg_64_xexec_xnull = REG_SEQUENCE %14:sreg_32, %subreg.sub0, %13:sreg_32, %subreg.sub1
    %17:sreg_64_xexec_xnull = REG_SEQUENCE %12:sreg_32, %subreg.sub0, %11:sreg_32, %subreg.sub1
    %18:sreg_32 = S_MOV_B32 3
    %19:vgpr_32 = V_LSHLREV_B32_e64 %18:sreg_32, %5:vgpr_32, implicit $exec
    %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %100:vreg_64 = REG_SEQUENCE %19:vgpr_32, %subreg.sub0, %101:vgpr_32, %subreg.sub1
    %2:vreg_64 = COPY %100:vreg_64
    %22:sreg_32 = S_MOV_B32 4
    %23:vgpr_32 = V_LSHLREV_B32_e64 %22:sreg_32, %5:vgpr_32, implicit $exec
    %24:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %16:sreg_64_xexec_xnull, %23:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 1)
    %25:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %17:sreg_64_xexec_xnull, %23:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 1)
    %26:vgpr_32 = COPY %24.sub3:vreg_128
    %27:vgpr_32 = V_PK_ADD_F16 8, %26:vgpr_32, 8, %26:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %28:vgpr_32 = COPY %24.sub2:vreg_128
    %29:vgpr_32 = V_PK_ADD_F16 8, %28:vgpr_32, 8, %28:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %30:vgpr_32 = COPY %24.sub1:vreg_128
    %31:vgpr_32 = V_PK_ADD_F16 8, %30:vgpr_32, 8, %30:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %32:vgpr_32 = COPY %24.sub0:vreg_128
    %33:vgpr_32 = V_PK_ADD_F16 8, %32:vgpr_32, 8, %32:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %99:vreg_128 = REG_SEQUENCE %33:vgpr_32, %subreg.sub0, %31:vgpr_32, %subreg.sub1, %29:vgpr_32, %subreg.sub2, %27:vgpr_32, %subreg.sub3
    %35:vgpr_32 = COPY %25.sub3:vreg_128
    %36:sreg_32 = S_MOV_B32 939538432
    %37:vgpr_32 = V_PK_MUL_F16 8, %35:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %38:vgpr_32 = COPY %25.sub2:vreg_128
    %39:vgpr_32 = V_PK_MUL_F16 8, %38:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %40:vgpr_32 = COPY %25.sub1:vreg_128
    %41:vgpr_32 = V_PK_MUL_F16 8, %40:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %42:vgpr_32 = COPY %25.sub0:vreg_128
    %43:vgpr_32 = V_PK_MUL_F16 8, %42:vgpr_32, 8, %36:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %98:vreg_128 = REG_SEQUENCE %43:vgpr_32, %subreg.sub0, %41:vgpr_32, %subreg.sub1, %39:vgpr_32, %subreg.sub2, %37:vgpr_32, %subreg.sub3
    early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %99:vreg_128, 8, %98:vreg_128, 8, 0, 0, 0, implicit $exec
    %47:sreg_32 = S_MOV_B32 1
    %48:vgpr_32 = V_AND_B32_e64 %5:vgpr_32, %47:sreg_32, implicit $exec
    %49:sreg_32 = S_MOV_B32 0
    %50:sreg_32 = V_CMP_EQ_U32_e64 %48:vgpr_32, %49:sreg_32, implicit $exec
    %4:sreg_32 = SI_IF %50:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
    S_BRANCH %bb.1

  bb.1.if.then:
    successors: %bb.2(0x80000000)

    %51:sreg_32 = COPY %1.sub5:sgpr_192
    %52:sreg_32 = COPY %1.sub4:sgpr_192
    %53:sreg_64_xexec_xnull = REG_SEQUENCE %52:sreg_32, %subreg.sub0, %51:sreg_32, %subreg.sub1
    %54:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %6:sgpr_64, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
    %55:vgpr_32 = COPY %2.sub0:vreg_64
    %57:vgpr_32 = V_LSHLREV_B32_e64 %47:sreg_32, %55:vgpr_32, implicit $exec
    %58:vgpr_32 = COPY %3.sub1:vreg_256
    %59:vgpr_32 = COPY %3.sub3:vreg_256
    %60:vgpr_32 = COPY %3.sub5:vreg_256
    %61:vgpr_32 = COPY %3.sub7:vreg_256
    %62:vgpr_32 = COPY %3.sub6:vreg_256
    %63:vgpr_32 = COPY %3.sub4:vreg_256
    %64:vgpr_32 = COPY %3.sub2:vreg_256
    %65:vgpr_32 = COPY %3.sub0:vreg_256
    %67:sreg_32 = IMPLICIT_DEF
    %68:vgpr_32 = COPY %67:sreg_32
    %66:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %65:vgpr_32, 0, 0, 0, %68:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %70:sreg_32 = IMPLICIT_DEF
    %71:vgpr_32 = COPY %70:sreg_32
    %69:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %64:vgpr_32, 0, 0, 0, %71:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %73:sreg_32 = IMPLICIT_DEF
    %74:vgpr_32 = COPY %73:sreg_32
    %72:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %63:vgpr_32, 0, 0, 0, %74:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %76:sreg_32 = IMPLICIT_DEF
    %77:vgpr_32 = COPY %76:sreg_32
    %75:vgpr_32 = V_FMA_MIXLO_F16 0, %54:sreg_32_xm0_xexec, 0, %62:vgpr_32, 0, 0, 0, %77:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %78:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %61:vgpr_32, 0, 0, 0, %75:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %79:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %60:vgpr_32, 0, 0, 0, %72:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %80:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %59:vgpr_32, 0, 0, 0, %69:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %81:vgpr_32 = V_FMA_MIXHI_F16 0, %54:sreg_32_xm0_xexec, 0, %58:vgpr_32, 0, 0, 0, %66:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %97:vreg_128 = REG_SEQUENCE %81:vgpr_32, %subreg.sub0, %80:vgpr_32, %subreg.sub1, %79:vgpr_32, %subreg.sub2, %78:vgpr_32, %subreg.sub3
    %83:vreg_128 = COPY %97:vreg_128
    GLOBAL_STORE_DWORDX4_SADDR %57:vgpr_32, %83:vreg_128, %53:sreg_64_xexec_xnull, 0, 0, implicit $exec :: (store (s128), addrspace 1)

  bb.2.if.end:

    SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
    S_ENDPGM 0

...
