1
+ /***************************************************************************
2
+ Copyright (c) 2014, The OpenBLAS Project
3
+ All rights reserved.
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+ 1. Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+ 2. Redistributions in binary form must reproduce the above copyright
10
+ notice, this list of conditions and the following disclaimer in
11
+ the documentation and/or other materials provided with the
12
+ distribution.
13
+ 3. Neither the name of the OpenBLAS project nor the names of
14
+ its contributors may be used to endorse or promote products
15
+ derived from this software without specific prior written permission.
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *****************************************************************************/
27
+
28
+
29
+ /* need a new enough GCC (> 6, with AVX2) or clang (>= 6) for the AVX2/AVX-512 kernel below */
30
+ #if (( defined(__GNUC__ ) && __GNUC__ > 6 && defined(__AVX2__ )) || (defined(__clang__ ) && __clang_major__ >= 6 ))
31
+
32
+ #include <immintrin.h>
33
+
34
+ #define HAVE_KERNEL_4x4 1
35
+
36
+ static void dsymv_kernel_4x4 (BLASLONG from , BLASLONG to , FLOAT * * a , FLOAT * x , FLOAT * y , FLOAT * temp1 , FLOAT * temp2 )
37
+ {
38
+
39
+
40
+ __m256d accum_0 , accum_1 , accum_2 , accum_3 ;
41
+ __m256d temp1_0 , temp1_1 , temp1_2 , temp1_3 ;
42
+
43
+ /* the 256 bit wide acculmulator vectors start out as zero */
44
+ accum_0 = _mm256_setzero_pd ();
45
+ accum_1 = _mm256_setzero_pd ();
46
+ accum_2 = _mm256_setzero_pd ();
47
+ accum_3 = _mm256_setzero_pd ();
48
+
49
+ temp1_0 = _mm256_broadcastsd_pd (_mm_load_sd (& temp1 [0 ]));
50
+ temp1_1 = _mm256_broadcastsd_pd (_mm_load_sd (& temp1 [1 ]));
51
+ temp1_2 = _mm256_broadcastsd_pd (_mm_load_sd (& temp1 [2 ]));
52
+ temp1_3 = _mm256_broadcastsd_pd (_mm_load_sd (& temp1 [3 ]));
53
+
54
+ #ifdef __AVX512CD__
55
+ __m512d accum_05 , accum_15 , accum_25 , accum_35 ;
56
+ __m512d temp1_05 , temp1_15 , temp1_25 , temp1_35 ;
57
+ BLASLONG to2 ;
58
+ int delta ;
59
+
60
+ /* the 512 bit wide accumulator vectors start out as zero */
61
+ accum_05 = _mm512_setzero_pd ();
62
+ accum_15 = _mm512_setzero_pd ();
63
+ accum_25 = _mm512_setzero_pd ();
64
+ accum_35 = _mm512_setzero_pd ();
65
+
66
+ temp1_05 = _mm512_broadcastsd_pd (_mm_load_sd (& temp1 [0 ]));
67
+ temp1_15 = _mm512_broadcastsd_pd (_mm_load_sd (& temp1 [1 ]));
68
+ temp1_25 = _mm512_broadcastsd_pd (_mm_load_sd (& temp1 [2 ]));
69
+ temp1_35 = _mm512_broadcastsd_pd (_mm_load_sd (& temp1 [3 ]));
70
+
71
+ delta = (to - from ) & ~7 ;
72
+ to2 = from + delta ;
73
+
74
+
75
+ for (; from < to2 ; from += 8 ) {
76
+ __m512d _x , _y ;
77
+ __m512d a0 , a1 , a2 , a3 ;
78
+
79
+ _y = _mm512_loadu_pd (& y [from ]);
80
+ _x = _mm512_loadu_pd (& x [from ]);
81
+
82
+ a0 = _mm512_loadu_pd (& a [0 ][from ]);
83
+ a1 = _mm512_loadu_pd (& a [1 ][from ]);
84
+ a2 = _mm512_loadu_pd (& a [2 ][from ]);
85
+ a3 = _mm512_loadu_pd (& a [3 ][from ]);
86
+
87
+ _y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3 ;
88
+
89
+ accum_05 += _x * a0 ;
90
+ accum_15 += _x * a1 ;
91
+ accum_25 += _x * a2 ;
92
+ accum_35 += _x * a3 ;
93
+
94
+ _mm512_storeu_pd (& y [from ], _y );
95
+
96
+ };
97
+
98
+ /*
99
+ * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
100
+ * below can continue using the intermediate results in its loop
101
+ */
102
+ accum_0 = _mm256_add_pd (_mm512_extractf64x4_pd (accum_05 , 0 ), _mm512_extractf64x4_pd (accum_05 , 1 ));
103
+ accum_1 = _mm256_add_pd (_mm512_extractf64x4_pd (accum_15 , 0 ), _mm512_extractf64x4_pd (accum_15 , 1 ));
104
+ accum_2 = _mm256_add_pd (_mm512_extractf64x4_pd (accum_25 , 0 ), _mm512_extractf64x4_pd (accum_25 , 1 ));
105
+ accum_3 = _mm256_add_pd (_mm512_extractf64x4_pd (accum_35 , 0 ), _mm512_extractf64x4_pd (accum_35 , 1 ));
106
+
107
+ #endif
108
+
109
+ for (; from != to ; from += 4 ) {
110
+ __m256d _x , _y ;
111
+ __m256d a0 , a1 , a2 , a3 ;
112
+
113
+ _y = _mm256_loadu_pd (& y [from ]);
114
+ _x = _mm256_loadu_pd (& x [from ]);
115
+
116
+ /* load 4 rows of matrix data */
117
+ a0 = _mm256_loadu_pd (& a [0 ][from ]);
118
+ a1 = _mm256_loadu_pd (& a [1 ][from ]);
119
+ a2 = _mm256_loadu_pd (& a [2 ][from ]);
120
+ a3 = _mm256_loadu_pd (& a [3 ][from ]);
121
+
122
+ _y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3 ;
123
+
124
+ accum_0 += _x * a0 ;
125
+ accum_1 += _x * a1 ;
126
+ accum_2 += _x * a2 ;
127
+ accum_3 += _x * a3 ;
128
+
129
+ _mm256_storeu_pd (& y [from ], _y );
130
+
131
+ };
132
+
133
+ /*
134
+ * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2
135
+ * output array. There is no direct instruction for this in 256 bit space, only in 128 space.
136
+ */
137
+
138
+ __m128d half_accum0 , half_accum1 , half_accum2 , half_accum3 ;
139
+
140
+
141
+ /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */
142
+ half_accum0 = _mm_add_pd (_mm256_extractf128_pd (accum_0 , 0 ), _mm256_extractf128_pd (accum_0 , 1 ));
143
+ half_accum1 = _mm_add_pd (_mm256_extractf128_pd (accum_1 , 0 ), _mm256_extractf128_pd (accum_1 , 1 ));
144
+ half_accum2 = _mm_add_pd (_mm256_extractf128_pd (accum_2 , 0 ), _mm256_extractf128_pd (accum_2 , 1 ));
145
+ half_accum3 = _mm_add_pd (_mm256_extractf128_pd (accum_3 , 0 ), _mm256_extractf128_pd (accum_3 , 1 ));
146
+
147
+ /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
148
+ half_accum0 = _mm_hadd_pd (half_accum0 , half_accum0 );
149
+ half_accum1 = _mm_hadd_pd (half_accum1 , half_accum1 );
150
+ half_accum2 = _mm_hadd_pd (half_accum2 , half_accum2 );
151
+ half_accum3 = _mm_hadd_pd (half_accum3 , half_accum3 );
152
+
153
+ /* and store the lowest double value from each of these vectors in the temp2 output */
154
+ temp2 [0 ] += half_accum0 [0 ];
155
+ temp2 [1 ] += half_accum1 [0 ];
156
+ temp2 [2 ] += half_accum2 [0 ];
157
+ temp2 [3 ] += half_accum3 [0 ];
158
+ }
159
+ #else
160
+ #include "dsymv_L_microk_haswell-2.c"
161
+ #endif
0 commit comments