Skip to content

Commit d14cf1c

Browse files
authored
Merge pull request #2189 from wjc404/develop
Update dgemm_kernel_4x8_haswell.S for reducing cache misses
2 parents b0b7600 + 95fb98f commit d14cf1c

File tree

1 file changed: 94 additions and 31 deletions.

kernel/x86_64/dgemm_kernel_4x8_haswell.S

Lines changed: 94 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
279279
vmulpd %ymm0 , %ymm9 , %ymm9
280280
vmulpd %ymm0 , %ymm10, %ymm10
281281
vmulpd %ymm0 , %ymm11, %ymm11
282+
#if B_PR1 > 32
282283
prefetcht0 128 + BUFFER1
284+
#endif
283285
vmulpd %ymm0 , %ymm12, %ymm12
284286
vmulpd %ymm0 , %ymm13, %ymm13
285287
vmulpd %ymm0 , %ymm14, %ymm14
286288
vmulpd %ymm0 , %ymm15, %ymm15
289+
#if B_PR1 > 96
287290
prefetcht0 192 + BUFFER1
291+
#endif
288292
vpermilpd $ 0x05 , %ymm5, %ymm5
289293
vpermilpd $ 0x05 , %ymm7, %ymm7
290-
294+
#if B_PR1 > 160
295+
prefetcht0 256 + BUFFER1
296+
#endif
291297
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
292298
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
293299
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
294300
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
295-
301+
#if B_PR1 > 224
302+
prefetcht0 320 + BUFFER1
303+
#endif
296304
vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
297305
vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
298-
306+
#if B_PR1 > 288
307+
prefetcht0 384 + BUFFER1
308+
#endif
299309
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
300310
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
301311
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
302312
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
303-
313+
#if B_PR1 > 352
314+
prefetcht0 448 + BUFFER1
315+
#endif
304316
leaq (CO1, LDC, 2), %rax
305317

318+
#if B_PR1 > 416
319+
prefetcht0 512 + BUFFER1
320+
#endif
306321

307322
#if !defined(TRMMKERNEL)
308323

@@ -1613,29 +1628,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
16131628
prefetcht0 24(CO1,LDC,4)
16141629
prefetcht0 (CO1,LDC,8)
16151630
prefetcht0 24(CO1,LDC,8)
1616-
addq LDC,CO1
1617-
prefetcht0 (CO1)
1618-
prefetcht0 24(CO1)
1619-
prefetcht0 (CO1,LDC,4)
1620-
prefetcht0 24(CO1,LDC,4)
1621-
prefetcht0 (CO1,LDC,8)
1622-
prefetcht0 24(CO1,LDC,8)
1623-
leaq (CO1,LDC,2),CO1
1624-
prefetcht0 (CO1)
1625-
prefetcht0 24(CO1)
1626-
prefetcht0 (CO1,LDC,4)
1627-
prefetcht0 24(CO1,LDC,4)
1628-
prefetcht0 (CO1,LDC,8)
1629-
prefetcht0 24(CO1,LDC,8)
1630-
subq LDC,CO1
1631-
prefetcht0 (CO1)
1632-
prefetcht0 24(CO1)
1633-
prefetcht0 (CO1,LDC,4)
1634-
prefetcht0 24(CO1,LDC,4)
1635-
prefetcht0 (CO1,LDC,8)
1636-
prefetcht0 24(CO1,LDC,8)
1637-
subq LDC,CO1
1638-
subq LDC,CO1
16391631
.endm
16401632
/*******************************************************************************************/
16411633

@@ -1805,12 +1797,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
18051797
dec %rax
18061798
jne .L12_12
18071799

1808-
PREFETCHT0_C
18091800
.L12_12a:
1810-
1801+
prefetcht0 ALPHA
1802+
PREFETCHT0_C
1803+
addq LDC,CO1
18111804
KERNEL4x12_M1
1805+
PREFETCHT0_C
1806+
leaq (CO1,LDC,2),CO1
18121807
KERNEL4x12_M2
1808+
PREFETCHT0_C
1809+
subq LDC,CO1
18131810
KERNEL4x12_M1
1811+
PREFETCHT0_C
1812+
subq LDC,CO1
1813+
subq LDC,CO1
18141814
KERNEL4x12_M2
18151815

18161816
KERNEL4x12_M1
@@ -1865,13 +1865,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
18651865

18661866
SAVE4x12
18671867

1868+
/* prefetch the next B source block here */
1869+
/* the increment should be proportional to GEMM_Q/GEMM_P */
1870+
1871+
salq $3, K
1872+
#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
1873+
prefetcht2 32(B)
1874+
prefetcht2 32(B, K, 8)
1875+
addq $64, B /* increment */
1876+
#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
1877+
prefetcht2 32(B)
1878+
prefetcht2 32(B, K, 8)
1879+
prefetcht2 96(B)
1880+
prefetcht2 96(B, K, 8)
1881+
addq $128, B /* increment */
1882+
#endif
1883+
sarq $3, K
1884+
18681885
decq I # i --
18691886
jne .L12_11
18701887
ALIGN_4
18711888

18721889
/**************************************************************************
18731890
* Rest of M
18741891
***************************************************************************/
1892+
1893+
/* recover the original value of pointer B after prefetch */
1894+
movq M, I
1895+
sarq $2, I
1896+
#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
1897+
salq $6, I
1898+
#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
1899+
salq $7, I
1900+
#endif
1901+
subq I, B
1902+
18751903
.L12_20:
18761904
// Test rest of M
18771905

@@ -2089,10 +2117,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
20892117
jne .L13_12
20902118

20912119
.L13_12a:
2092-
2120+
prefetcht0 ALPHA
2121+
PREFETCHT0_C
2122+
addq LDC,CO1
20932123
KERNEL4x12_M1
2124+
PREFETCHT0_C
2125+
leaq (CO1,LDC,2),CO1
20942126
KERNEL4x12_M2
2127+
PREFETCHT0_C
2128+
subq LDC,CO1
20952129
KERNEL4x12_M1
2130+
PREFETCHT0_C
2131+
subq LDC,CO1
2132+
subq LDC,CO1
20962133
KERNEL4x12_M2
20972134

20982135
KERNEL4x12_M1
@@ -2102,7 +2139,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21022139

21032140
jmp .L13_16
21042141

2105-
21062142
.L13_13:
21072143

21082144
test $1, %rax
@@ -2147,13 +2183,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21472183

21482184
SAVE4x12
21492185

2186+
/* prefetch the next B source block here */
2187+
/* the increment should be proportional to GEMM_Q/GEMM_P */
2188+
2189+
salq $3, K
2190+
#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
2191+
prefetcht2 (B)
2192+
prefetcht2 (B, K, 8)
2193+
addq $64, B /* increment */
2194+
#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
2195+
prefetcht2 (B)
2196+
prefetcht2 (B, K, 8)
2197+
prefetcht2 64(B)
2198+
prefetcht2 64(B, K, 8)
2199+
addq $128, B /* increment */
2200+
#endif
2201+
sarq $3, K
2202+
21502203
decq I # i --
21512204
jne .L13_11
21522205
ALIGN_4
21532206

21542207
/**************************************************************************
21552208
* Rest of M
21562209
***************************************************************************/
2210+
/* recover the original value of pointer B after prefetch */
2211+
movq M, I
2212+
sarq $2, I
2213+
#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
2214+
salq $6, I
2215+
#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
2216+
salq $7, I
2217+
#endif
2218+
subq I, B
2219+
21572220
.L13_20:
21582221
// Test rest of M
21592222

0 commit comments

Comments
 (0)