Skip to content

Commit f49f804

Browse files
authored
Add files via upload
1 parent 825777f commit f49f804

File tree

1 file changed

+43
-7
lines changed

1 file changed

+43
-7
lines changed

kernel/x86_64/dgemm_kernel_4x8_haswell.S

Lines changed: 43 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
279279
vmulpd %ymm0 , %ymm9 , %ymm9
280280
vmulpd %ymm0 , %ymm10, %ymm10
281281
vmulpd %ymm0 , %ymm11, %ymm11
282+
#if B_PR1 >= 96
282283
prefetcht0 128 + BUFFER1
284+
#endif
283285
vmulpd %ymm0 , %ymm12, %ymm12
284286
vmulpd %ymm0 , %ymm13, %ymm13
285287
vmulpd %ymm0 , %ymm14, %ymm14
286288
vmulpd %ymm0 , %ymm15, %ymm15
289+
#if B_PR1 >= 160
287290
prefetcht0 192 + BUFFER1
291+
#endif
288292
vpermilpd $ 0x05 , %ymm5, %ymm5
289293
vpermilpd $ 0x05 , %ymm7, %ymm7
290-
294+
#if B_PR1 >= 224
295+
prefetcht0 256 + BUFFER1
296+
#endif
291297
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
292298
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
293299
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
294300
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
295-
301+
#if B_PR1 >= 288
302+
prefetcht0 320 + BUFFER1
303+
#endif
296304
vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
297305
vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
298-
306+
#if B_PR1 >= 352
307+
prefetcht0 384 + BUFFER1
308+
#endif
299309
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
300310
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
301311
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
302312
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
303-
313+
#if B_PR1 >= 416
314+
prefetcht0 448 + BUFFER1
315+
#endif
304316
leaq (CO1, LDC, 2), %rax
305317

318+
#if B_PR1 >= 480
319+
prefetcht0 512 + BUFFER1
320+
#endif
306321

307322
#if !defined(TRMMKERNEL)
308323

@@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
18671882

18681883
/* here for the prefetch of next b source block */
18691884
/* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */
1870-
/* currently an increment of 128 byte is suitable */
1885+
18711886
salq $3, K
1887+
#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
1888+
prefetcht2 32(B)
1889+
prefetcht2 32(B, K, 8)
1890+
addq $64, B /* increment */
1891+
#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
18721892
prefetcht2 32(B)
18731893
prefetcht2 32(B, K, 8)
18741894
prefetcht2 96(B)
18751895
prefetcht2 96(B, K, 8)
18761896
addq $128, B /* increment */
1897+
#endif
18771898
sarq $3, K
18781899

18791900
decq I # i --
@@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
18831904
/**************************************************************************
18841905
* Rest of M
18851906
***************************************************************************/
1886-
/* recover the original value of pointer B */
1907+
1908+
/* recover the original value of pointer B after prefetch */
18871909
movq M, I
18881910
sarq $2, I
1911+
#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
1912+
salq $6, I
1913+
#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
18891914
salq $7, I
1915+
#endif
18901916
subq I, B
18911917

18921918
.L12_20:
@@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21662192

21672193
/* here for the prefetch of next b source block */
21682194
/* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */
2169-
/* currently an increment of 128 byte is suitable */
2195+
21702196
salq $3, K
2197+
#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
2198+
prefetcht2 (B)
2199+
prefetcht2 (B, K, 8)
2200+
addq $64, B
2201+
#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
21712202
prefetcht2 (B)
21722203
prefetcht2 (B, K, 8)
21732204
prefetcht2 64(B)
21742205
prefetcht2 64(B, K, 8)
21752206
addq $128, B
2207+
#endif
21762208
sarq $3, K
21772209

21782210
decq I # i --
@@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21852217
/* recover the original value of pointer B after prefetch */
21862218
movq M, I
21872219
sarq $2, I
2220+
#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
2221+
salq $6, I
2222+
#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
21882223
salq $7, I
2224+
#endif
21892225
subq I, B
21902226

21912227
.L13_20:

0 commit comments

Comments
 (0)