1- ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
2- ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
3- ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
4- ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
1+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
2+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
3+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
4+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
55; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
66; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
77; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
8- ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
8+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
1010
1111; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
1212; are not converted from f16 to f32.
13- ; GCN-LABEL: {{^}}dotproduct_f16
13+ ; GCN-LABEL: {{^}}dotproduct_f16_contract
1414; GFX900: v_fma_f16
1515; GFX900: v_fma_f16
1616
17- ; GFX906: v_mul_f16_e32
18- ; GFX906: v_mul_f16_e32
19-
2017; GFX906-DL-UNSAFE: v_fma_f16
2118; GFX10-CONTRACT: v_fmac_f16
2219
2320; GFX906-CONTRACT: v_mac_f16_e32
2421; GFX906-DENORM-CONTRACT: v_fma_f16
2522; GFX906-DOT10-DISABLED: v_fma_f16
23+
; Loads two <2 x half> vectors, multiplies them lane by lane in half precision
; and accumulates both products into the half value stored at %dst.
; Every fmul and fadd carries the 'contract' flag, and the lanes are never
; extended to float.
define amdgpu_kernel void @dotproduct_f16_contract(ptr addrspace(1) %src1,
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input vector.
  %a.lo = extractelement <2 x half> %a.vec, i64 0
  %b.lo = extractelement <2 x half> %b.vec, i64 0

  ; Lane 1 of each input vector.
  %a.hi = extractelement <2 x half> %a.vec, i64 1
  %b.hi = extractelement <2 x half> %b.vec, i64 1

  ; The lane-1 product is added to the loaded accumulator first, then the
  ; lane-0 product is added on top of that partial sum.
  %prod.hi = fmul contract half %a.hi, %b.hi
  %prod.lo = fmul contract half %a.lo, %b.lo
  %acc.in = load half, ptr addrspace(1) %dst, align 2
  %sum.hi = fadd contract half %prod.hi, %acc.in
  %sum.all = fadd contract half %prod.lo, %sum.hi
  store half %sum.all, ptr addrspace(1) %dst, align 2
  ret void
}
45+
46+ ; GCN-LABEL: {{^}}dotproduct_f16
47+
48+ ; GFX906: v_mul_f16_e32
49+ ; GFX906: v_mul_f16_e32
50+
2651define amdgpu_kernel void @dotproduct_f16 (ptr addrspace (1 ) %src1 ,
2752 ptr addrspace (1 ) %src2 ,
2853 ptr addrspace (1 ) nocapture %dst ) {
@@ -45,18 +70,12 @@ entry:
4570 ret void
4671}
4772
48-
4973; We only want to generate fdot2 if:
5074; - vector element of dot product is converted from f16 to f32, and
5175; - the vectors are of type <2 x half>, and
5276; - "dot10-insts" is enabled
5377
54- ; GCN-LABEL: {{^}}dotproduct_f16_f32
55- ; GFX900: v_mad_mix_f32
56- ; GFX900: v_mad_mix_f32
57-
58- ; GFX906: v_mad_f32
59- ; GFX906: v_mac_f32_e32
78+ ; GCN-LABEL: {{^}}dotproduct_f16_f32_contract
6079
6180; GFX906-DL-UNSAFE: v_dot2_f32_f16
6281; GFX10-DL-UNSAFE: v_dot2c_f32_f16
@@ -65,6 +84,39 @@ entry:
6584
6685; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
6786; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Loads two <2 x half> vectors, extends each lane to float, multiplies matching
; lanes and accumulates both products into the float stored at %dst.
; Every fmul and fadd carries the 'contract' flag.
define amdgpu_kernel void @dotproduct_f16_f32_contract(ptr addrspace(1) %src1,
                                                       ptr addrspace(1) %src2,
                                                       ptr addrspace(1) nocapture %dst) {
entry:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened from half to float.
  %a.lo = extractelement <2 x half> %a.vec, i64 0
  %a.lo.f32 = fpext half %a.lo to float
  %b.lo = extractelement <2 x half> %b.vec, i64 0
  %b.lo.f32 = fpext half %b.lo to float

  ; Lane 1 of each input, widened from half to float.
  %a.hi = extractelement <2 x half> %a.vec, i64 1
  %a.hi.f32 = fpext half %a.hi to float
  %b.hi = extractelement <2 x half> %b.vec, i64 1
  %b.hi.f32 = fpext half %b.hi to float

  ; acc + a.hi * b.hi, then + a.lo * b.lo.
  %prod.hi = fmul contract float %a.hi.f32, %b.hi.f32
  %prod.lo = fmul contract float %a.lo.f32, %b.lo.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.hi = fadd contract float %prod.hi, %acc.in
  %sum.all = fadd contract float %prod.lo, %sum.hi
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}
112+
113+ ; GCN-LABEL: {{^}}dotproduct_f16_f32
114+ ; GFX900: v_mad_mix_f32
115+ ; GFX900: v_mad_mix_f32
116+
117+ ; GFX906: v_mad_f32
118+ ; GFX906: v_mac_f32_e32
119+
68120define amdgpu_kernel void @dotproduct_f16_f32 (ptr addrspace (1 ) %src1 ,
69121 ptr addrspace (1 ) %src2 ,
70122 ptr addrspace (1 ) nocapture %dst ) {
@@ -96,19 +148,46 @@ entry:
96148; - the vectors are of type <2 x half>, and
97149; - "dot10-insts" is enabled
98150
151+ ; GCN-LABEL: {{^}}dotproduct_diffvecorder_contract
152+ ; GFX906-DL-UNSAFE: v_dot2_f32_f16
153+ ; GFX10-DL-UNSAFE: v_dot2c_f32_f16
154+
155+ ; GFX906-CONTRACT: v_dot2_f32_f16
156+ ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
157+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Same shape as a two-lane half-to-float dot product accumulated into %dst,
; except that the lane-1 multiply lists the %src2 lane before the %src1 lane.
; Every fmul and fadd carries the 'contract' flag.
define amdgpu_kernel void @dotproduct_diffvecorder_contract(ptr addrspace(1) %src1,
                                                            ptr addrspace(1) %src2,
                                                            ptr addrspace(1) nocapture %dst) {
entry:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened from half to float.
  %a.lo = extractelement <2 x half> %a.vec, i64 0
  %a.lo.f32 = fpext half %a.lo to float
  %b.lo = extractelement <2 x half> %b.vec, i64 0
  %b.lo.f32 = fpext half %b.lo to float

  ; Lane 1 of each input, widened from half to float.
  %a.hi = extractelement <2 x half> %a.vec, i64 1
  %a.hi.f32 = fpext half %a.hi to float
  %b.hi = extractelement <2 x half> %b.vec, i64 1
  %b.hi.f32 = fpext half %b.hi to float

  ; Operand order differs between the two multiplies: (b, a) for lane 1 and
  ; (a, b) for lane 0.
  %prod.hi = fmul contract float %b.hi.f32, %a.hi.f32
  %prod.lo = fmul contract float %a.lo.f32, %b.lo.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.hi = fadd contract float %prod.hi, %acc.in
  %sum.all = fadd contract float %prod.lo, %sum.hi
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}
183+
99184; GCN-LABEL: {{^}}dotproduct_diffvecorder
100185; GFX900: v_mad_mix_f32
101186; GFX900: v_mad_mix_f32
102187
103188; GFX906: v_mad_f32
104189; GFX906: v_mac_f32_e32
105190
106- ; GFX906-DL-UNSAFE: v_dot2_f32_f16
107- ; GFX10-DL-UNSAFE: v_dot2c_f32_f16
108-
109- ; GFX906-CONTRACT: v_dot2_f32_f16
110- ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
111- ; GFX906-DOT10-DISABLED: v_fma_mix_f32
112191define amdgpu_kernel void @dotproduct_diffvecorder (ptr addrspace (1 ) %src1 ,
113192 ptr addrspace (1 ) %src2 ,
114193 ptr addrspace (1 ) nocapture %dst ) {
@@ -136,17 +215,45 @@ entry:
136215}
137216
138217; Tests to make sure dot product is not generated when the vectors are not of <2 x half>.
139- ; GCN-LABEL: {{^}}dotproduct_v4f16
140- ; GFX900: v_mad_mix_f32
141-
142- ; GFX906: v_mad_f32
143- ; GFX906: v_mac_f32_e32
218+ ; GCN-LABEL: {{^}}dotproduct_v4f16_contract
144219
145220; GCN-DL-UNSAFE: v_fma_mix_f32
146221
147222; GFX906-CONTRACT: v_fma_mix_f32
148223; GFX906-DENORM-CONTRACT: v_fma_mix_f32
149224; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Loads two <4 x half> vectors, extends lanes 0 and 1 of each to float,
; multiplies matching lanes and accumulates both products into the float
; stored at %dst. Lanes 2 and 3 are not used.
;
; Both fmuls and both fadds carry the 'contract' flag. The lane-0 multiply
; previously lacked it, which by itself forbids fusing that multiply with its
; add; with the flag present on the whole chain, the <4 x half> source type is
; what this kernel exercises.
define amdgpu_kernel void @dotproduct_v4f16_contract(ptr addrspace(1) %src1,
                                                     ptr addrspace(1) %src2,
                                                     ptr addrspace(1) nocapture %dst) {
entry:
  %src1.vec = load <4 x half>, ptr addrspace(1) %src1
  %src2.vec = load <4 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened from half to float.
  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Lane 1 of each input, widened from half to float.
  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; acc + el2 * el2, then + el1 * el1, all contractable.
  %mul2 = fmul contract float %csrc1.el2, %csrc2.el2
  %mul1 = fmul contract float %csrc1.el1, %csrc2.el1
  %acc = load float, ptr addrspace(1) %dst, align 4
  %acc1 = fadd contract float %mul2, %acc
  %acc2 = fadd contract float %mul1, %acc1
  store float %acc2, ptr addrspace(1) %dst, align 4
  ret void
}
250+
251+ ; GCN-LABEL: {{^}}dotproduct_v4f16
252+ ; GFX900: v_mad_mix_f32
253+
254+ ; GFX906: v_mad_f32
255+ ; GFX906: v_mac_f32_e32
256+
150257define amdgpu_kernel void @dotproduct_v4f16 (ptr addrspace (1 ) %src1 ,
151258 ptr addrspace (1 ) %src2 ,
152259 ptr addrspace (1 ) nocapture %dst ) {
@@ -173,18 +280,46 @@ entry:
173280 ret void
174281}
175282
283+ ; GCN-LABEL: {{^}}NotAdotproductContract
284+
285+ ; GCN-DL-UNSAFE: v_fma_mix_f32
286+
287+ ; GFX906-CONTRACT: v_fma_mix_f32
288+ ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
289+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Loads two <2 x half> vectors and extends all four lanes to float, but each
; product multiplies two lanes taken from the same vector (src1 lane 1 by
; src1 lane 0, and src2 lane 0 by src2 lane 1). Both products are accumulated
; into the float stored at %dst. Every fmul and fadd carries 'contract'.
define amdgpu_kernel void @NotAdotproductContract(ptr addrspace(1) %src1,
                                                  ptr addrspace(1) %src2,
                                                  ptr addrspace(1) nocapture %dst) {
entry:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened from half to float.
  %a.lo = extractelement <2 x half> %a.vec, i64 0
  %a.lo.f32 = fpext half %a.lo to float
  %b.lo = extractelement <2 x half> %b.vec, i64 0
  %b.lo.f32 = fpext half %b.lo to float

  ; Lane 1 of each input, widened from half to float.
  %a.hi = extractelement <2 x half> %a.vec, i64 1
  %a.hi.f32 = fpext half %a.hi to float
  %b.hi = extractelement <2 x half> %b.vec, i64 1
  %b.hi.f32 = fpext half %b.hi to float

  ; Products stay within one vector each: a.hi * a.lo and b.lo * b.hi.
  %prod.a = fmul contract float %a.hi.f32, %a.lo.f32
  %prod.b = fmul contract float %b.lo.f32, %b.hi.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.a = fadd contract float %prod.a, %acc.in
  %sum.all = fadd contract float %prod.b, %sum.a
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}
315+
176316; GCN-LABEL: {{^}}NotAdotproduct
177317; GFX900: v_mad_mix_f32
178318; GFX900: v_mad_mix_f32
179319
180320; GFX906: v_mad_f32
181321; GFX906: v_mac_f32_e32
182322
183- ; GCN-DL-UNSAFE: v_fma_mix_f32
184-
185- ; GFX906-CONTRACT: v_fma_mix_f32
186- ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
187- ; GFX906-DOT10-DISABLED: v_fma_mix_f32
188323define amdgpu_kernel void @NotAdotproduct (ptr addrspace (1 ) %src1 ,
189324 ptr addrspace (1 ) %src2 ,
190325 ptr addrspace (1 ) nocapture %dst ) {
@@ -211,18 +346,46 @@ entry:
211346 ret void
212347}
213348
349+ ; GCN-LABEL: {{^}}Diff_Idx_NotAdotproductContract
350+
351+ ; GCN-DL-UNSAFE: v_fma_mix_f32
352+
353+ ; GFX906-CONTRACT: v_fma_mix_f32
354+ ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
355+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Loads two <2 x half> vectors and extends all four lanes to float, but each
; product pairs lanes with different indices (src1 lane 1 by src2 lane 0, and
; src1 lane 0 by src2 lane 1). Both products are accumulated into the float
; stored at %dst. Every fmul and fadd carries 'contract'.
define amdgpu_kernel void @Diff_Idx_NotAdotproductContract(ptr addrspace(1) %src1,
                                                           ptr addrspace(1) %src2,
                                                           ptr addrspace(1) nocapture %dst) {
entry:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened from half to float.
  %a.lo = extractelement <2 x half> %a.vec, i64 0
  %a.lo.f32 = fpext half %a.lo to float
  %b.lo = extractelement <2 x half> %b.vec, i64 0
  %b.lo.f32 = fpext half %b.lo to float

  ; Lane 1 of each input, widened from half to float.
  %a.hi = extractelement <2 x half> %a.vec, i64 1
  %a.hi.f32 = fpext half %a.hi to float
  %b.hi = extractelement <2 x half> %b.vec, i64 1
  %b.hi.f32 = fpext half %b.hi to float

  ; Cross-lane products: a.hi * b.lo and a.lo * b.hi.
  %prod.x = fmul contract float %a.hi.f32, %b.lo.f32
  %prod.y = fmul contract float %a.lo.f32, %b.hi.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.x = fadd contract float %prod.x, %acc.in
  %sum.all = fadd contract float %prod.y, %sum.x
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}
381+
214382; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
215383; GFX900: v_mad_mix_f32
216384; GFX900: v_mad_mix_f32
217385
218386; GFX906: v_mad_f32
219387; GFX906: v_mac_f32_e32
220388
221- ; GCN-DL-UNSAFE: v_fma_mix_f32
222-
223- ; GFX906-CONTRACT: v_fma_mix_f32
224- ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
225- ; GFX906-DOT10-DISABLED: v_fma_mix_f32
226389define amdgpu_kernel void @Diff_Idx_NotAdotproduct (ptr addrspace (1 ) %src1 ,
227390 ptr addrspace (1 ) %src2 ,
228391 ptr addrspace (1 ) nocapture %dst ) {
0 commit comments