1- ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900 
2- ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE 
3- ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT 
4- ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT 
1+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900 
2+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE 
3+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT 
4+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT 
55; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906 
66; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-CONTRACT 
77; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DENORM-CONTRACT 
8- ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DOT10-DISABLED 
8+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DOT10-DISABLED 
99; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z) 
1010
1111; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions 
1212; are not converted from f16 to f32. 
13- ; GCN-LABEL: {{^}}dotproduct_f16  
13+ ; GCN-LABEL: {{^}}dotproduct_f16_contract  
1414; GFX900: v_fma_f16 
1515; GFX900: v_fma_f16 
1616
17- ; GFX906: v_mul_f16_e32 
18- ; GFX906: v_mul_f16_e32 
19- 
2017; GFX906-DL-UNSAFE:  v_fma_f16 
2118; GFX10-CONTRACT: v_fmac_f16 
2219
2320; GFX906-CONTRACT: v_mac_f16_e32 
2421; GFX906-DENORM-CONTRACT: v_fma_f16 
2522; GFX906-DOT10-DISABLED: v_fma_f16 
23+ 
; Kernel computing src1[0]*src2[0] + src1[1]*src2[1] + *dst entirely in half
; precision: the extracted elements are never extended to float. Every fmul and
; fadd in the body carries the `fast` flag.
24+ define  amdgpu_kernel void  @dotproduct_f16_contract (ptr  addrspace (1 ) %src1 ,
25+                                                    ptr  addrspace (1 ) %src2 ,
26+                                                    ptr  addrspace (1 ) nocapture  %dst ) {
27+ entry:
; Load both <2 x half> operands from global memory (addrspace 1).
28+   %src1.vec  = load  <2  x half >, ptr  addrspace (1 ) %src1 
29+   %src2.vec  = load  <2  x half >, ptr  addrspace (1 ) %src2 
30+ 
; Lane 0 of each vector.
31+   %src1.el1  = extractelement  <2  x half > %src1.vec , i64  0 
32+   %src2.el1  = extractelement  <2  x half > %src2.vec , i64  0 
33+ 
; Lane 1 of each vector.
34+   %src1.el2  = extractelement  <2  x half > %src1.vec , i64  1 
35+   %src2.el2  = extractelement  <2  x half > %src2.vec , i64  1 
36+ 
; Products of matching lanes, then accumulate onto the half value read from
; dst and write the sum back to dst.
37+   %mul2  = fmul  fast half  %src1.el2 , %src2.el2 
38+   %mul1  = fmul  fast half  %src1.el1 , %src2.el1 
39+   %acc  = load  half , ptr  addrspace (1 ) %dst , align  2 
40+   %acc1  = fadd  fast half  %mul2 , %acc 
41+   %acc2  = fadd  fast half  %mul1 , %acc1 
42+   store  half  %acc2 , ptr  addrspace (1 ) %dst , align  2 
43+   ret  void 
44+ }
45+ 
46+ ; GCN-LABEL: {{^}}dotproduct_f16 
47+ 
48+ ; GFX906: v_mul_f16_e32 
49+ ; GFX906: v_mul_f16_e32 
50+ 
2651define  amdgpu_kernel void  @dotproduct_f16 (ptr  addrspace (1 ) %src1 ,
2752                                          ptr  addrspace (1 ) %src2 ,
2853                                          ptr  addrspace (1 ) nocapture  %dst ) {
@@ -45,18 +70,12 @@ entry:
4570  ret  void 
4671}
4772
48- 
4973; We only want to generate fdot2 if: 
5074; - vector element of dot product is converted from f16 to f32, and 
5175; - the vectors are of type <2 x half>, and 
5276; - "dot10-insts" is enabled 
5377
54- ; GCN-LABEL: {{^}}dotproduct_f16_f32 
55- ; GFX900: v_mad_mix_f32 
56- ; GFX900: v_mad_mix_f32 
57- 
58- ; GFX906: v_mad_f32 
59- ; GFX906: v_mac_f32_e32 
78+ ; GCN-LABEL: {{^}}dotproduct_f16_f32_contract 
6079
6180; GFX906-DL-UNSAFE: v_dot2_f32_f16 
6281; GFX10-DL-UNSAFE: v_dot2c_f32_f16 
@@ -65,6 +84,39 @@ entry:
6584
6685; GFX906-DENORM-CONTRACT: v_dot2_f32_f16 
6786; GFX906-DOT10-DISABLED: v_fma_mix_f32 
; Kernel computing src1[0]*src2[0] + src1[1]*src2[1] + *dst in float: each
; half element of the two <2 x half> inputs is extended to float before the
; multiply. Every fmul and fadd in the body carries the `fast` flag.
87+ define  amdgpu_kernel void  @dotproduct_f16_f32_contract (ptr  addrspace (1 ) %src1 ,
88+                                                        ptr  addrspace (1 ) %src2 ,
89+                                                        ptr  addrspace (1 ) nocapture  %dst ) {
90+ entry:
91+   %src1.vec  = load  <2  x half >, ptr  addrspace (1 ) %src1 
92+   %src2.vec  = load  <2  x half >, ptr  addrspace (1 ) %src2 
93+ 
; Lane 0 of each vector, extended half -> float.
94+   %src1.el1  = extractelement  <2  x half > %src1.vec , i64  0 
95+   %csrc1.el1  = fpext  half  %src1.el1  to  float 
96+   %src2.el1  = extractelement  <2  x half > %src2.vec , i64  0 
97+   %csrc2.el1  = fpext  half  %src2.el1  to  float 
98+ 
; Lane 1 of each vector, extended half -> float.
99+   %src1.el2  = extractelement  <2  x half > %src1.vec , i64  1 
100+   %csrc1.el2  = fpext  half  %src1.el2  to  float 
101+   %src2.el2  = extractelement  <2  x half > %src2.vec , i64  1 
102+   %csrc2.el2  = fpext  half  %src2.el2  to  float 
103+ 
; Products of matching lanes (src1 operand first in both), accumulated onto
; the float read from dst; the sum is stored back to dst.
104+   %mul2  = fmul  fast float  %csrc1.el2 , %csrc2.el2 
105+   %mul1  = fmul  fast float  %csrc1.el1 , %csrc2.el1 
106+   %acc  = load  float , ptr  addrspace (1 ) %dst , align  4 
107+   %acc1  = fadd  fast float  %mul2 , %acc 
108+   %acc2  = fadd  fast float  %mul1 , %acc1 
109+   store  float  %acc2 , ptr  addrspace (1 ) %dst , align  4 
110+   ret  void 
111+ }
112+ 
113+ ; GCN-LABEL: {{^}}dotproduct_f16_f32 
114+ ; GFX900: v_mad_mix_f32 
115+ ; GFX900: v_mad_mix_f32 
116+ 
117+ ; GFX906: v_mad_f32 
118+ ; GFX906: v_mac_f32_e32 
119+ 
68120define  amdgpu_kernel void  @dotproduct_f16_f32 (ptr  addrspace (1 ) %src1 ,
69121                                              ptr  addrspace (1 ) %src2 ,
70122                                              ptr  addrspace (1 ) nocapture  %dst ) {
@@ -96,19 +148,46 @@ entry:
96148; - the vectors are of type <2 x half>, and 
97149; - "dot10-insts" is enabled 
98150
151+ ; GCN-LABEL: {{^}}dotproduct_diffvecorder_contract 
152+ ; GFX906-DL-UNSAFE: v_dot2_f32_f16 
153+ ; GFX10-DL-UNSAFE: v_dot2c_f32_f16 
154+ 
155+ ; GFX906-CONTRACT: v_dot2_f32_f16 
156+ ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16 
157+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32 
; Kernel computing src1[0]*src2[0] + src2[1]*src1[1] + *dst in float, with the
; half elements extended to float first. The lane-1 product lists the src2
; element before the src1 element, while the lane-0 product lists src1 first.
; Every fmul and fadd in the body carries the `fast` flag.
158+ define  amdgpu_kernel void  @dotproduct_diffvecorder_contract (ptr  addrspace (1 ) %src1 ,
159+                                                             ptr  addrspace (1 ) %src2 ,
160+                                                             ptr  addrspace (1 ) nocapture  %dst ) {
161+ entry:
162+   %src1.vec  = load  <2  x half >, ptr  addrspace (1 ) %src1 
163+   %src2.vec  = load  <2  x half >, ptr  addrspace (1 ) %src2 
164+ 
; Lane 0 of each vector, extended half -> float.
165+   %src1.el1  = extractelement  <2  x half > %src1.vec , i64  0 
166+   %csrc1.el1  = fpext  half  %src1.el1  to  float 
167+   %src2.el1  = extractelement  <2  x half > %src2.vec , i64  0 
168+   %csrc2.el1  = fpext  half  %src2.el1  to  float 
169+ 
; Lane 1 of each vector, extended half -> float.
170+   %src1.el2  = extractelement  <2  x half > %src1.vec , i64  1 
171+   %csrc1.el2  = fpext  half  %src1.el2  to  float 
172+   %src2.el2  = extractelement  <2  x half > %src2.vec , i64  1 
173+   %csrc2.el2  = fpext  half  %src2.el2  to  float 
174+ 
; Operand order differs between the two products: src2 first for lane 1,
; src1 first for lane 0.
175+   %mul2  = fmul  fast float  %csrc2.el2 , %csrc1.el2 
176+   %mul1  = fmul  fast float  %csrc1.el1 , %csrc2.el1 
177+   %acc  = load  float , ptr  addrspace (1 ) %dst , align  4 
178+   %acc1  = fadd  fast float  %mul2 , %acc 
179+   %acc2  = fadd  fast float  %mul1 , %acc1 
180+   store  float  %acc2 , ptr  addrspace (1 ) %dst , align  4 
181+   ret  void 
182+ }
183+ 
99184; GCN-LABEL: {{^}}dotproduct_diffvecorder 
100185; GFX900: v_mad_mix_f32 
101186; GFX900: v_mad_mix_f32 
102187
103188; GFX906: v_mad_f32 
104189; GFX906: v_mac_f32_e32 
105190
106- ; GFX906-DL-UNSAFE: v_dot2_f32_f16 
107- ; GFX10-DL-UNSAFE: v_dot2c_f32_f16 
108- 
109- ; GFX906-CONTRACT: v_dot2_f32_f16 
110- ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16 
111- ; GFX906-DOT10-DISABLED: v_fma_mix_f32 
112191define  amdgpu_kernel void  @dotproduct_diffvecorder (ptr  addrspace (1 ) %src1 ,
113192                                                   ptr  addrspace (1 ) %src2 ,
114193                                                   ptr  addrspace (1 ) nocapture  %dst ) {
@@ -136,17 +215,45 @@ entry:
136215}
137216
138217; Tests to make sure dot product is not generated when the vectors are not of type <2 x half>. 
139- ; GCN-LABEL: {{^}}dotproduct_v4f16 
140- ; GFX900: v_mad_mix_f32 
141- 
142- ; GFX906: v_mad_f32 
143- ; GFX906: v_mac_f32_e32 
218+ ; GCN-LABEL: {{^}}dotproduct_v4f16_contract 
144219
145220; GCN-DL-UNSAFE: v_fma_mix_f32 
146221
147222; GFX906-CONTRACT: v_fma_mix_f32 
148223; GFX906-DENORM-CONTRACT: v_fma_mix_f32 
149224; GFX906-DOT10-DISABLED: v_fma_mix_f32 
; Kernel that loads two <4 x half> vectors but uses only lanes 0 and 1 of
; each: it computes src1[0]*src2[0] + src1[1]*src2[1] + *dst in float after
; extending the half elements, and stores the sum back to dst.
225+ define  amdgpu_kernel void  @dotproduct_v4f16_contract (ptr  addrspace (1 ) %src1 ,
226+                                                      ptr  addrspace (1 ) %src2 ,
227+                                                      ptr  addrspace (1 ) nocapture  %dst ) {
228+ entry:
; The operands are <4 x half> here, not <2 x half>.
229+   %src1.vec  = load  <4  x half >, ptr  addrspace (1 ) %src1 
230+   %src2.vec  = load  <4  x half >, ptr  addrspace (1 ) %src2 
231+ 
; Lane 0 of each vector, extended half -> float.
232+   %src1.el1  = extractelement  <4  x half > %src1.vec , i64  0 
233+   %csrc1.el1  = fpext  half  %src1.el1  to  float 
234+   %src2.el1  = extractelement  <4  x half > %src2.vec , i64  0 
235+   %csrc2.el1  = fpext  half  %src2.el1  to  float 
236+ 
; Lane 1 of each vector, extended half -> float. Lanes 2 and 3 are never read.
237+   %src1.el2  = extractelement  <4  x half > %src1.vec , i64  1 
238+   %csrc1.el2  = fpext  half  %src1.el2  to  float 
239+   %src2.el2  = extractelement  <4  x half > %src2.vec , i64  1 
240+   %csrc2.el2  = fpext  half  %src2.el2  to  float 
241+ 
; NOTE(review): %mul1 below has no `fast` flag, unlike %mul2 and both fadds;
; confirm whether the omission is intentional.
242+   %mul2  = fmul  fast float  %csrc1.el2 , %csrc2.el2 
243+   %mul1  = fmul  float  %csrc1.el1 , %csrc2.el1 
244+   %acc  = load  float , ptr  addrspace (1 ) %dst , align  4 
245+   %acc1  = fadd  fast float  %mul2 , %acc 
246+   %acc2  = fadd  fast float  %mul1 , %acc1 
247+   store  float  %acc2 , ptr  addrspace (1 ) %dst , align  4 
248+   ret  void 
249+ }
250+ 
251+ ; GCN-LABEL: {{^}}dotproduct_v4f16 
252+ ; GFX900: v_mad_mix_f32 
253+ 
254+ ; GFX906: v_mad_f32 
255+ ; GFX906: v_mac_f32_e32 
256+ 
150257define  amdgpu_kernel void  @dotproduct_v4f16 (ptr  addrspace (1 ) %src1 ,
151258                                            ptr  addrspace (1 ) %src2 ,
152259                                            ptr  addrspace (1 ) nocapture  %dst ) {
@@ -173,18 +280,46 @@ entry:
173280  ret  void 
174281}
175282
283+ ; GCN-LABEL: {{^}}NotAdotproductContract 
284+ 
285+ ; GCN-DL-UNSAFE: v_fma_mix_f32 
286+ 
287+ ; GFX906-CONTRACT: v_fma_mix_f32 
288+ ; GFX906-DENORM-CONTRACT: v_fma_mix_f32 
289+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32 
; Kernel computing src1[1]*src1[0] + src2[0]*src2[1] + *dst in float. Each
; product multiplies two lanes of the SAME input vector, so the expression is
; not a dot product of src1 with src2. Every fmul and fadd carries `fast`.
290+ define  amdgpu_kernel void  @NotAdotproductContract (ptr  addrspace (1 ) %src1 ,
291+                                                   ptr  addrspace (1 ) %src2 ,
292+                                                   ptr  addrspace (1 ) nocapture  %dst ) {
293+ entry:
294+   %src1.vec  = load  <2  x half >, ptr  addrspace (1 ) %src1 
295+   %src2.vec  = load  <2  x half >, ptr  addrspace (1 ) %src2 
296+ 
; Lane 0 of each vector, extended half -> float.
297+   %src1.el1  = extractelement  <2  x half > %src1.vec , i64  0 
298+   %csrc1.el1  = fpext  half  %src1.el1  to  float 
299+   %src2.el1  = extractelement  <2  x half > %src2.vec , i64  0 
300+   %csrc2.el1  = fpext  half  %src2.el1  to  float 
301+ 
; Lane 1 of each vector, extended half -> float.
302+   %src1.el2  = extractelement  <2  x half > %src1.vec , i64  1 
303+   %csrc1.el2  = fpext  half  %src1.el2  to  float 
304+   %src2.el2  = extractelement  <2  x half > %src2.vec , i64  1 
305+   %csrc2.el2  = fpext  half  %src2.el2  to  float 
306+ 
; %mul2 uses only src1 lanes and %mul1 uses only src2 lanes.
307+   %mul2  = fmul  fast float  %csrc1.el2 , %csrc1.el1 
308+   %mul1  = fmul  fast float  %csrc2.el1 , %csrc2.el2 
309+   %acc  = load  float , ptr  addrspace (1 ) %dst , align  4 
310+   %acc1  = fadd  fast float  %mul2 , %acc 
311+   %acc2  = fadd  fast float  %mul1 , %acc1 
312+   store  float  %acc2 , ptr  addrspace (1 ) %dst , align  4 
313+   ret  void 
314+ }
315+ 
176316; GCN-LABEL: {{^}}NotAdotproduct 
177317; GFX900: v_mad_mix_f32 
178318; GFX900: v_mad_mix_f32 
179319
180320; GFX906: v_mad_f32 
181321; GFX906: v_mac_f32_e32 
182322
183- ; GCN-DL-UNSAFE: v_fma_mix_f32 
184- 
185- ; GFX906-CONTRACT: v_fma_mix_f32 
186- ; GFX906-DENORM-CONTRACT: v_fma_mix_f32 
187- ; GFX906-DOT10-DISABLED: v_fma_mix_f32 
188323define  amdgpu_kernel void  @NotAdotproduct (ptr  addrspace (1 ) %src1 ,
189324                                          ptr  addrspace (1 ) %src2 ,
190325                                          ptr  addrspace (1 ) nocapture  %dst ) {
@@ -211,18 +346,46 @@ entry:
211346  ret  void 
212347}
213348
349+ ; GCN-LABEL: {{^}}Diff_Idx_NotAdotproductContract 
350+ 
351+ ; GCN-DL-UNSAFE: v_fma_mix_f32 
352+ 
353+ ; GFX906-CONTRACT: v_fma_mix_f32 
354+ ; GFX906-DENORM-CONTRACT: v_fma_mix_f32 
355+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32 
; Kernel computing src1[1]*src2[0] + src1[0]*src2[1] + *dst in float. Each
; product pairs a src1 lane with the OTHER lane index of src2, so the lanes do
; not line up as they would in a dot product. Every fmul and fadd carries
; `fast`.
356+ define  amdgpu_kernel void  @Diff_Idx_NotAdotproductContract (ptr  addrspace (1 ) %src1 ,
357+                                                            ptr  addrspace (1 ) %src2 ,
358+                                                            ptr  addrspace (1 ) nocapture  %dst ) {
359+ entry:
360+   %src1.vec  = load  <2  x half >, ptr  addrspace (1 ) %src1 
361+   %src2.vec  = load  <2  x half >, ptr  addrspace (1 ) %src2 
362+ 
; Lane 0 of each vector, extended half -> float.
363+   %src1.el1  = extractelement  <2  x half > %src1.vec , i64  0 
364+   %csrc1.el1  = fpext  half  %src1.el1  to  float 
365+   %src2.el1  = extractelement  <2  x half > %src2.vec , i64  0 
366+   %csrc2.el1  = fpext  half  %src2.el1  to  float 
367+ 
; Lane 1 of each vector, extended half -> float.
368+   %src1.el2  = extractelement  <2  x half > %src1.vec , i64  1 
369+   %csrc1.el2  = fpext  half  %src1.el2  to  float 
370+   %src2.el2  = extractelement  <2  x half > %src2.vec , i64  1 
371+   %csrc2.el2  = fpext  half  %src2.el2  to  float 
372+ 
; Crossed lane indices: src1 lane 1 with src2 lane 0, src1 lane 0 with src2
; lane 1.
373+   %mul2  = fmul  fast float  %csrc1.el2 , %csrc2.el1 
374+   %mul1  = fmul  fast float  %csrc1.el1 , %csrc2.el2 
375+   %acc  = load  float , ptr  addrspace (1 ) %dst , align  4 
376+   %acc1  = fadd  fast float  %mul2 , %acc 
377+   %acc2  = fadd  fast float  %mul1 , %acc1 
378+   store  float  %acc2 , ptr  addrspace (1 ) %dst , align  4 
379+   ret  void 
380+ }
381+ 
214382; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct 
215383; GFX900: v_mad_mix_f32 
216384; GFX900: v_mad_mix_f32 
217385
218386; GFX906: v_mad_f32 
219387; GFX906: v_mac_f32_e32 
220388
221- ; GCN-DL-UNSAFE: v_fma_mix_f32 
222- 
223- ; GFX906-CONTRACT: v_fma_mix_f32 
224- ; GFX906-DENORM-CONTRACT: v_fma_mix_f32 
225- ; GFX906-DOT10-DISABLED: v_fma_mix_f32 
226389define  amdgpu_kernel void  @Diff_Idx_NotAdotproduct (ptr  addrspace (1 ) %src1 ,
227390                                                   ptr  addrspace (1 ) %src2 ,
228391                                                   ptr  addrspace (1 ) nocapture  %dst ) {
0 commit comments