@@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
279
279
vmulpd %ymm0 , %ymm9 , %ymm9
280
280
vmulpd %ymm0 , %ymm10 , %ymm10
281
281
vmulpd %ymm0 , %ymm11 , %ymm11
282
+ #if B_PR1 > 32
282
283
prefetcht0 128 + BUFFER1
284
+ #endif
283
285
vmulpd %ymm0 , %ymm12 , %ymm12
284
286
vmulpd %ymm0 , %ymm13 , %ymm13
285
287
vmulpd %ymm0 , %ymm14 , %ymm14
286
288
vmulpd %ymm0 , %ymm15 , %ymm15
289
+ #if B_PR1 > 96
287
290
prefetcht0 192 + BUFFER1
291
+ #endif
288
292
vpermilpd $ 0x05 , %ymm5 , %ymm5
289
293
vpermilpd $ 0x05 , %ymm7 , %ymm7
290
-
294
+ #if B_PR1 > 160
295
+ prefetcht0 256 + BUFFER1
296
+ #endif
291
297
vblendpd $ 0x0a , %ymm5 , %ymm4 , %ymm0
292
298
vblendpd $ 0x05 , %ymm5 , %ymm4 , %ymm1
293
299
vblendpd $ 0x0a , %ymm7 , %ymm6 , %ymm2
294
300
vblendpd $ 0x05 , %ymm7 , %ymm6 , %ymm3
295
-
301
+ #if B_PR1 > 224
302
+ prefetcht0 320 + BUFFER1
303
+ #endif
296
304
vperm2f128 $ 0x01 , %ymm2 , %ymm2 , %ymm2
297
305
vperm2f128 $ 0x01 , %ymm3 , %ymm3 , %ymm3
298
-
306
+ #if B_PR1 > 288
307
+ prefetcht0 384 + BUFFER1
308
+ #endif
299
309
vblendpd $ 0x03 , %ymm0 , %ymm2 , %ymm4
300
310
vblendpd $ 0x03 , %ymm1 , %ymm3 , %ymm5
301
311
vblendpd $ 0x03 , %ymm2 , %ymm0 , %ymm6
302
312
vblendpd $ 0x03 , %ymm3 , %ymm1 , %ymm7
303
-
313
+ #if B_PR1 > 352
314
+ prefetcht0 448 + BUFFER1
315
+ #endif
304
316
leaq (CO1, LDC, 2 ), %rax
305
317
318
+ #if B_PR1 > 416
319
+ prefetcht0 512 + BUFFER1
320
+ #endif
306
321
307
322
#if !defined(TRMMKERNEL)
308
323
@@ -1613,29 +1628,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1613
1628
prefetcht0 24 (CO1,LDC,4 )
1614
1629
prefetcht0 (CO1,LDC,8 )
1615
1630
prefetcht0 24 (CO1,LDC,8 )
1616
- addq LDC,CO1
1617
- prefetcht0 (CO1)
1618
- prefetcht0 24 (CO1)
1619
- prefetcht0 (CO1,LDC,4 )
1620
- prefetcht0 24 (CO1,LDC,4 )
1621
- prefetcht0 (CO1,LDC,8 )
1622
- prefetcht0 24 (CO1,LDC,8 )
1623
- leaq (CO1,LDC,2 ),CO1
1624
- prefetcht0 (CO1)
1625
- prefetcht0 24 (CO1)
1626
- prefetcht0 (CO1,LDC,4 )
1627
- prefetcht0 24 (CO1,LDC,4 )
1628
- prefetcht0 (CO1,LDC,8 )
1629
- prefetcht0 24 (CO1,LDC,8 )
1630
- subq LDC,CO1
1631
- prefetcht0 (CO1)
1632
- prefetcht0 24 (CO1)
1633
- prefetcht0 (CO1,LDC,4 )
1634
- prefetcht0 24 (CO1,LDC,4 )
1635
- prefetcht0 (CO1,LDC,8 )
1636
- prefetcht0 24 (CO1,LDC,8 )
1637
- subq LDC,CO1
1638
- subq LDC,CO1
1639
1631
.endm
1640
1632
/*******************************************************************************************/
1641
1633
@@ -1805,12 +1797,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1805
1797
dec %rax
1806
1798
jne .L12_12
1807
1799
1808
- PREFETCHT0_C
1809
1800
.L12_12a:
1810
-
1801
+ prefetcht0 ALPHA
1802
+ PREFETCHT0_C
1803
+ addq LDC,CO1
1811
1804
KERNEL4x12_M1
1805
+ PREFETCHT0_C
1806
+ leaq (CO1,LDC,2 ),CO1
1812
1807
KERNEL4x12_M2
1808
+ PREFETCHT0_C
1809
+ subq LDC,CO1
1813
1810
KERNEL4x12_M1
1811
+ PREFETCHT0_C
1812
+ subq LDC,CO1
1813
+ subq LDC,CO1
1814
1814
KERNEL4x12_M2
1815
1815
1816
1816
KERNEL4x12_M1
@@ -1865,13 +1865,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1865
1865
1866
1866
SAVE4x12
1867
1867
1868
+ /* here for the prefetch of next b source block */
1869
+ /* the increment should be proportional to GEMM_Q/GEMM_P */
1870
+
1871
+ salq $3 , K
1872
+ #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
1873
+ prefetcht2 32 (B)
1874
+ prefetcht2 32 (B, K, 8 )
1875
+ addq $64 , B /* increment */
1876
+ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
1877
+ prefetcht2 32 (B)
1878
+ prefetcht2 32 (B, K, 8 )
1879
+ prefetcht2 96 (B)
1880
+ prefetcht2 96 (B, K, 8 )
1881
+ addq $128 , B /* increment */
1882
+ #endif
1883
+ sarq $3 , K
1884
+
1868
1885
decq I # i --
1869
1886
jne .L12_11
1870
1887
ALIGN_4
1871
1888
1872
1889
/**************************************************************************
1873
1890
* Rest of M
1874
1891
***************************************************************************/
1892
+
1893
+ /* recover the original value of pointer B after prefetch */
1894
+ movq M, I
1895
+ sarq $2 , I
1896
+ #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
1897
+ salq $6 , I
1898
+ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
1899
+ salq $7 , I
1900
+ #endif
1901
+ subq I, B
1902
+
1875
1903
.L12_20:
1876
1904
// Test rest of M
1877
1905
@@ -2089,10 +2117,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2089
2117
jne .L13_12
2090
2118
2091
2119
.L13_12a:
2092
-
2120
+ prefetcht0 ALPHA
2121
+ PREFETCHT0_C
2122
+ addq LDC,CO1
2093
2123
KERNEL4x12_M1
2124
+ PREFETCHT0_C
2125
+ leaq (CO1,LDC,2 ),CO1
2094
2126
KERNEL4x12_M2
2127
+ PREFETCHT0_C
2128
+ subq LDC,CO1
2095
2129
KERNEL4x12_M1
2130
+ PREFETCHT0_C
2131
+ subq LDC,CO1
2132
+ subq LDC,CO1
2096
2133
KERNEL4x12_M2
2097
2134
2098
2135
KERNEL4x12_M1
@@ -2102,7 +2139,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2102
2139
2103
2140
jmp .L13_16
2104
2141
2105
-
2106
2142
.L13_13:
2107
2143
2108
2144
test $1 , %rax
@@ -2147,13 +2183,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2147
2183
2148
2184
SAVE4x12
2149
2185
2186
+ /* here for the prefetch of next b source block */
2187
+ /* the increment should be proportional to GEMM_Q/GEMM_P */
2188
+
2189
+ salq $3 , K
2190
+ #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
2191
+ prefetcht2 (B)
2192
+ prefetcht2 (B, K, 8 )
2193
+ addq $64 , B /* increment */
2194
+ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
2195
+ prefetcht2 (B)
2196
+ prefetcht2 (B, K, 8 )
2197
+ prefetcht2 64 (B)
2198
+ prefetcht2 64 (B, K, 8 )
2199
+ addq $128 , B /* increment */
2200
+ #endif
2201
+ sarq $3 , K
2202
+
2150
2203
decq I # i --
2151
2204
jne .L13_11
2152
2205
ALIGN_4
2153
2206
2154
2207
/**************************************************************************
2155
2208
* Rest of M
2156
2209
***************************************************************************/
2210
+ /* recover the original value of pointer B */
2211
+ movq M, I
2212
+ sarq $2 , I
2213
+ #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
2214
+ salq $6 , I
2215
+ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
2216
+ salq $7 , I
2217
+ #endif
2218
+ subq I, B
2219
+
2157
2220
.L13_20:
2158
2221
// Test rest of M
2159
2222