|
38 | 38 |
|
39 | 39 | #include <stdio.h>
|
40 | 40 | #include "common.h"
|
| 41 | +#include <immintrin.h> |
41 | 42 |
|
42 | 43 | int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){
|
43 | 44 | BLASLONG i, j;
|
@@ -84,131 +85,129 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
|
84 | 85 | i = (m >> 3);
|
85 | 86 | if (i > 0){
|
86 | 87 | do{
|
87 |
| - ctemp01 = *(aoffset1 + 0); |
88 |
| - ctemp02 = *(aoffset1 + 1); |
89 |
| - ctemp03 = *(aoffset1 + 2); |
90 |
| - ctemp04 = *(aoffset1 + 3); |
91 |
| - ctemp05 = *(aoffset1 + 4); |
92 |
| - ctemp06 = *(aoffset1 + 5); |
| 88 | + __m128d xmm0, xmm1; |
| 89 | + xmm0 = _mm_load_pd1(aoffset2 + 0); |
| 90 | + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 0); |
| 91 | + _mm_storeu_pd(boffset + 0, xmm0); |
| 92 | + |
93 | 93 | ctemp07 = *(aoffset1 + 6);
|
94 | 94 | ctemp08 = *(aoffset1 + 7);
|
95 | 95 |
|
96 |
| - ctemp09 = *(aoffset2 + 0); |
97 |
| - ctemp10 = *(aoffset2 + 1); |
98 |
| - ctemp11 = *(aoffset2 + 2); |
99 |
| - ctemp12 = *(aoffset2 + 3); |
100 |
| - ctemp13 = *(aoffset2 + 4); |
101 |
| - ctemp14 = *(aoffset2 + 5); |
| 96 | + xmm1 = _mm_load_pd1(aoffset4 + 0); |
| 97 | + xmm1 = _mm_loadl_pd(xmm1, aoffset3 + 0); |
| 98 | + _mm_storeu_pd(boffset + 2, xmm1); |
| 99 | + |
| 100 | + xmm0 = _mm_load_pd1(aoffset6 + 0); |
| 101 | + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 0); |
| 102 | + _mm_storeu_pd(boffset + 4, xmm0); |
| 103 | + |
| 104 | + xmm0 = _mm_load_pd1(aoffset8 + 0); |
| 105 | + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 0); |
| 106 | + _mm_storeu_pd(boffset + 6, xmm0); |
| 107 | + |
102 | 108 | ctemp15 = *(aoffset2 + 6);
|
103 | 109 | ctemp16 = *(aoffset2 + 7);
|
104 | 110 |
|
105 |
| - ctemp17 = *(aoffset3 + 0); |
106 |
| - ctemp18 = *(aoffset3 + 1); |
107 |
| - ctemp19 = *(aoffset3 + 2); |
108 |
| - ctemp20 = *(aoffset3 + 3); |
109 |
| - ctemp21 = *(aoffset3 + 4); |
110 |
| - ctemp22 = *(aoffset3 + 5); |
| 111 | + xmm0 = _mm_load_pd1(aoffset2 + 1); |
| 112 | + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 1); |
| 113 | + _mm_storeu_pd(boffset + 8, xmm0); |
| 114 | + |
| 115 | + xmm0 = _mm_load_pd1(aoffset4 + 1); |
| 116 | + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 1); |
| 117 | + _mm_storeu_pd(boffset + 10, xmm0); |
| 118 | + |
| 119 | + xmm0 = _mm_load_pd1(aoffset6 + 1); |
| 120 | + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 1); |
| 121 | + _mm_storeu_pd(boffset + 12, xmm0); |
| 122 | + |
| 123 | + xmm0 = _mm_load_pd1(aoffset8 + 1); |
| 124 | + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 1); |
| 125 | + _mm_storeu_pd(boffset + 14, xmm0); |
| 126 | + |
| 127 | + xmm0 = _mm_load_pd1(aoffset2 + 2); |
| 128 | + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 2); |
| 129 | + _mm_storeu_pd(boffset + 16, xmm0); |
| 130 | + |
| 131 | + xmm0 = _mm_load_pd1(aoffset4 + 2); |
| 132 | + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 2); |
| 133 | + _mm_storeu_pd(boffset + 18, xmm0); |
| 134 | + |
| 135 | + xmm0 = _mm_load_pd1(aoffset6 + 2); |
| 136 | + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 2); |
| 137 | + _mm_storeu_pd(boffset + 20, xmm0); |
| 138 | + |
| 139 | + xmm0 = _mm_load_pd1(aoffset8 + 2); |
| 140 | + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 2); |
| 141 | + _mm_storeu_pd(boffset + 22, xmm0); |
| 142 | + |
111 | 143 | ctemp23 = *(aoffset3 + 6);
|
112 | 144 | ctemp24 = *(aoffset3 + 7);
|
113 | 145 |
|
114 |
| - ctemp25 = *(aoffset4 + 0); |
115 |
| - ctemp26 = *(aoffset4 + 1); |
116 |
| - ctemp27 = *(aoffset4 + 2); |
117 |
| - ctemp28 = *(aoffset4 + 3); |
118 |
| - ctemp29 = *(aoffset4 + 4); |
119 |
| - ctemp30 = *(aoffset4 + 5); |
| 146 | + xmm0 = _mm_load_pd1(aoffset2 + 3); |
| 147 | + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 3); |
| 148 | + _mm_storeu_pd(boffset + 24, xmm0); |
| 149 | + |
| 150 | + xmm0 = _mm_load_pd1(aoffset4 + 3); |
| 151 | + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 3); |
| 152 | + _mm_storeu_pd(boffset + 26, xmm0); |
| 153 | + |
| 154 | + xmm0 = _mm_load_pd1(aoffset6 + 3); |
| 155 | + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 3); |
| 156 | + _mm_storeu_pd(boffset + 28, xmm0); |
| 157 | + |
| 158 | + xmm0 = _mm_load_pd1(aoffset8 + 3); |
| 159 | + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 3); |
| 160 | + _mm_storeu_pd(boffset + 30, xmm0); |
| 161 | + |
120 | 162 | ctemp31 = *(aoffset4 + 6);
|
121 | 163 | ctemp32 = *(aoffset4 + 7);
|
122 | 164 |
|
123 |
| - ctemp33 = *(aoffset5 + 0); |
124 |
| - ctemp34 = *(aoffset5 + 1); |
125 |
| - ctemp35 = *(aoffset5 + 2); |
126 |
| - ctemp36 = *(aoffset5 + 3); |
127 |
| - ctemp37 = *(aoffset5 + 4); |
128 |
| - ctemp38 = *(aoffset5 + 5); |
| 165 | + |
| 166 | + xmm0 = _mm_load_pd1(aoffset2 + 4); |
| 167 | + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 4); |
| 168 | + _mm_storeu_pd(boffset + 32, xmm0); |
| 169 | + |
| 170 | + xmm0 = _mm_load_pd1(aoffset4 + 4); |
| 171 | + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 4); |
| 172 | + _mm_storeu_pd(boffset + 34, xmm0); |
| 173 | + |
| 174 | + xmm0 = _mm_load_pd1(aoffset6 + 4); |
| 175 | + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 4); |
| 176 | + _mm_storeu_pd(boffset + 36, xmm0); |
| 177 | + |
| 178 | + xmm0 = _mm_load_pd1(aoffset8 + 4); |
| 179 | + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 4); |
| 180 | + _mm_storeu_pd(boffset + 38, xmm0); |
| 181 | + |
129 | 182 | ctemp39 = *(aoffset5 + 6);
|
130 | 183 | ctemp40 = *(aoffset5 + 7);
|
131 | 184 |
|
132 |
| - ctemp41 = *(aoffset6 + 0); |
133 |
| - ctemp42 = *(aoffset6 + 1); |
134 |
| - ctemp43 = *(aoffset6 + 2); |
135 |
| - ctemp44 = *(aoffset6 + 3); |
136 |
| - ctemp45 = *(aoffset6 + 4); |
137 |
| - ctemp46 = *(aoffset6 + 5); |
| 185 | + xmm0 = _mm_load_pd1(aoffset2 + 5); |
| 186 | + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 5); |
| 187 | + _mm_storeu_pd(boffset + 40, xmm0); |
| 188 | + |
| 189 | + xmm0 = _mm_load_pd1(aoffset4 + 5); |
| 190 | + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 5); |
| 191 | + _mm_storeu_pd(boffset + 42, xmm0); |
| 192 | + |
| 193 | + xmm0 = _mm_load_pd1(aoffset6 + 5); |
| 194 | + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 5); |
| 195 | + _mm_storeu_pd(boffset + 44, xmm0); |
| 196 | + |
| 197 | + xmm0 = _mm_load_pd1(aoffset8 + 5); |
| 198 | + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 5); |
| 199 | + _mm_storeu_pd(boffset + 46, xmm0); |
| 200 | + |
| 201 | + |
138 | 202 | ctemp47 = *(aoffset6 + 6);
|
139 | 203 | ctemp48 = *(aoffset6 + 7);
|
140 | 204 |
|
141 |
| - ctemp49 = *(aoffset7 + 0); |
142 |
| - ctemp50 = *(aoffset7 + 1); |
143 |
| - ctemp51 = *(aoffset7 + 2); |
144 |
| - ctemp52 = *(aoffset7 + 3); |
145 |
| - ctemp53 = *(aoffset7 + 4); |
146 |
| - ctemp54 = *(aoffset7 + 5); |
147 | 205 | ctemp55 = *(aoffset7 + 6);
|
148 | 206 | ctemp56 = *(aoffset7 + 7);
|
149 | 207 |
|
150 |
| - ctemp57 = *(aoffset8 + 0); |
151 |
| - ctemp58 = *(aoffset8 + 1); |
152 |
| - ctemp59 = *(aoffset8 + 2); |
153 |
| - ctemp60 = *(aoffset8 + 3); |
154 |
| - ctemp61 = *(aoffset8 + 4); |
155 |
| - ctemp62 = *(aoffset8 + 5); |
156 | 208 | ctemp63 = *(aoffset8 + 6);
|
157 | 209 | ctemp64 = *(aoffset8 + 7);
|
158 | 210 |
|
159 |
| - *(boffset + 0) = ctemp01; |
160 |
| - *(boffset + 1) = ctemp09; |
161 |
| - *(boffset + 2) = ctemp17; |
162 |
| - *(boffset + 3) = ctemp25; |
163 |
| - *(boffset + 4) = ctemp33; |
164 |
| - *(boffset + 5) = ctemp41; |
165 |
| - *(boffset + 6) = ctemp49; |
166 |
| - *(boffset + 7) = ctemp57; |
167 |
| - |
168 |
| - *(boffset + 8) = ctemp02; |
169 |
| - *(boffset + 9) = ctemp10; |
170 |
| - *(boffset + 10) = ctemp18; |
171 |
| - *(boffset + 11) = ctemp26; |
172 |
| - *(boffset + 12) = ctemp34; |
173 |
| - *(boffset + 13) = ctemp42; |
174 |
| - *(boffset + 14) = ctemp50; |
175 |
| - *(boffset + 15) = ctemp58; |
176 |
| - |
177 |
| - *(boffset + 16) = ctemp03; |
178 |
| - *(boffset + 17) = ctemp11; |
179 |
| - *(boffset + 18) = ctemp19; |
180 |
| - *(boffset + 19) = ctemp27; |
181 |
| - *(boffset + 20) = ctemp35; |
182 |
| - *(boffset + 21) = ctemp43; |
183 |
| - *(boffset + 22) = ctemp51; |
184 |
| - *(boffset + 23) = ctemp59; |
185 |
| - |
186 |
| - *(boffset + 24) = ctemp04; |
187 |
| - *(boffset + 25) = ctemp12; |
188 |
| - *(boffset + 26) = ctemp20; |
189 |
| - *(boffset + 27) = ctemp28; |
190 |
| - *(boffset + 28) = ctemp36; |
191 |
| - *(boffset + 29) = ctemp44; |
192 |
| - *(boffset + 30) = ctemp52; |
193 |
| - *(boffset + 31) = ctemp60; |
194 |
| - |
195 |
| - *(boffset + 32) = ctemp05; |
196 |
| - *(boffset + 33) = ctemp13; |
197 |
| - *(boffset + 34) = ctemp21; |
198 |
| - *(boffset + 35) = ctemp29; |
199 |
| - *(boffset + 36) = ctemp37; |
200 |
| - *(boffset + 37) = ctemp45; |
201 |
| - *(boffset + 38) = ctemp53; |
202 |
| - *(boffset + 39) = ctemp61; |
203 |
| - |
204 |
| - *(boffset + 40) = ctemp06; |
205 |
| - *(boffset + 41) = ctemp14; |
206 |
| - *(boffset + 42) = ctemp22; |
207 |
| - *(boffset + 43) = ctemp30; |
208 |
| - *(boffset + 44) = ctemp38; |
209 |
| - *(boffset + 45) = ctemp46; |
210 |
| - *(boffset + 46) = ctemp54; |
211 |
| - *(boffset + 47) = ctemp62; |
212 | 211 |
|
213 | 212 | *(boffset + 48) = ctemp07;
|
214 | 213 | *(boffset + 49) = ctemp15;
|
|
0 commit comments