@@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
279
279
vmulpd %ymm0 , %ymm9 , %ymm9
280
280
vmulpd %ymm0 , %ymm10 , %ymm10
281
281
vmulpd %ymm0 , %ymm11 , %ymm11
282
+ #if B_PR1 >= 96
282
283
prefetcht0 128 + BUFFER1
284
+ #endif
283
285
vmulpd %ymm0 , %ymm12 , %ymm12
284
286
vmulpd %ymm0 , %ymm13 , %ymm13
285
287
vmulpd %ymm0 , %ymm14 , %ymm14
286
288
vmulpd %ymm0 , %ymm15 , %ymm15
289
+ #if B_PR1 >= 160
287
290
prefetcht0 192 + BUFFER1
291
+ #endif
288
292
vpermilpd $ 0x05 , %ymm5 , %ymm5
289
293
vpermilpd $ 0x05 , %ymm7 , %ymm7
290
-
294
+ #if B_PR1 >= 224
295
+ prefetcht0 256 + BUFFER1
296
+ #endif
291
297
vblendpd $ 0x0a , %ymm5 , %ymm4 , %ymm0
292
298
vblendpd $ 0x05 , %ymm5 , %ymm4 , %ymm1
293
299
vblendpd $ 0x0a , %ymm7 , %ymm6 , %ymm2
294
300
vblendpd $ 0x05 , %ymm7 , %ymm6 , %ymm3
295
-
301
+ #if B_PR1 >= 288
302
+ prefetcht0 320 + BUFFER1
303
+ #endif
296
304
vperm2f128 $ 0x01 , %ymm2 , %ymm2 , %ymm2
297
305
vperm2f128 $ 0x01 , %ymm3 , %ymm3 , %ymm3
298
-
306
+ #if B_PR1 >= 352
307
+ prefetcht0 384 + BUFFER1
308
+ #endif
299
309
vblendpd $ 0x03 , %ymm0 , %ymm2 , %ymm4
300
310
vblendpd $ 0x03 , %ymm1 , %ymm3 , %ymm5
301
311
vblendpd $ 0x03 , %ymm2 , %ymm0 , %ymm6
302
312
vblendpd $ 0x03 , %ymm3 , %ymm1 , %ymm7
303
-
313
+ #if B_PR1 >= 416
314
+ prefetcht0 448 + BUFFER1
315
+ #endif
304
316
leaq (CO1, LDC, 2 ), %rax
305
317
318
+ #if B_PR1 >= 480
319
+ prefetcht0 512 + BUFFER1
320
+ #endif
306
321
307
322
#if !defined(TRMMKERNEL)
308
323
@@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1867
1882
1868
1883
/* prefetch the next B source block here */
1869
1884
/* the amount added to B per iteration should be proportional to the ratio GEMM_Q/GEMM_P */
1870
- /* currently an increment of 128 byte is suitable */
1885
+
1871
1886
salq $3 , K
1887
+ #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
1888
+ prefetcht2 32 (B)
1889
+ prefetcht2 32 (B, K, 8 )
1890
+ addq $64 , B /* increment */
1891
+ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
1872
1892
prefetcht2 32 (B)
1873
1893
prefetcht2 32 (B, K, 8 )
1874
1894
prefetcht2 96 (B)
1875
1895
prefetcht2 96 (B, K, 8 )
1876
1896
addq $128 , B /* increment */
1897
+ #endif
1877
1898
sarq $3 , K
1878
1899
1879
1900
decq I # i --
@@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1883
1904
/**************************************************************************
1884
1905
* Rest of M
1885
1906
***************************************************************************/
1886
- /* recover the original value of pointer B */
1907
+
1908
+ /* recover the original value of pointer B after prefetch */
1887
1909
movq M, I
1888
1910
sarq $2 , I
1911
+ #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
1912
+ salq $6 , I
1913
+ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
1889
1914
salq $7 , I
1915
+ #endif
1890
1916
subq I, B
1891
1917
1892
1918
.L12_20:
@@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2166
2192
2167
2193
/* prefetch the next B source block here */
2168
2194
/* the amount added to B per iteration should be proportional to the ratio GEMM_Q/GEMM_P */
2169
- /* currently an increment of 128 byte is suitable */
2195
+
2170
2196
salq $3 , K
2197
+ #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
2198
+ prefetcht2 (B)
2199
+ prefetcht2 (B, K, 8 )
2200
+ addq $64 , B
2201
+ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
2171
2202
prefetcht2 (B)
2172
2203
prefetcht2 (B, K, 8 )
2173
2204
prefetcht2 64 (B)
2174
2205
prefetcht2 64 (B, K, 8 )
2175
2206
addq $128 , B
2207
+ #endif
2176
2208
sarq $3 , K
2177
2209
2178
2210
decq I # i --
@@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2185
2217
/* recover the original value of pointer B after prefetch */
2186
2218
movq M, I
2187
2219
sarq $2 , I
2220
+ #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
2221
+ salq $6 , I
2222
+ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
2188
2223
salq $7 , I
2224
+ #endif
2189
2225
subq I, B
2190
2226
2191
2227
.L13_20:
0 commit comments