@@ -249,26 +249,283 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
249
249
250
250
251
251
#if UNROLL_M == 16
252
- VECTOR_BLOCK (16 , 4 )
253
252
VECTOR_BLOCK (16 , 2 )
254
253
VECTOR_BLOCK (16 , 1 )
255
254
#endif
256
255
#if UNROLL_N == 8
257
256
VECTOR_BLOCK (8 , 8 )
258
257
VECTOR_BLOCK (4 , 8 )
259
258
#endif
259
+ #ifndef DOUBLE
260
260
VECTOR_BLOCK (8 , 4 )
261
+ #endif
261
262
VECTOR_BLOCK (8 , 2 )
262
263
VECTOR_BLOCK (8 , 1 )
263
264
VECTOR_BLOCK (4 , 4 )
264
265
VECTOR_BLOCK (4 , 2 )
265
266
VECTOR_BLOCK (4 , 1 )
266
267
268
+ /**
269
+ * Calculate for a row-block in C_i of size ROWSxCOLS using scalar operations.
270
+ * Simple implementation for smaller block sizes
271
+ *
272
+ * @param[in] A Pointer current block of input matrix A.
273
+ * @param[in] k Number of columns in A.
274
+ * @param[in] B Pointer current block of input matrix B.
275
+ * @param[inout] C Pointer current block of output matrix C.
276
+ * @param[in] ldc Offset between elements in adjacent columns in C.
277
+ * @param[in] alpha Scalar factor.
278
+ */
279
+ #define SCALAR_BLOCK (ROWS , COLS ) \
280
+ static inline void GEBP_block_##ROWS##_##COLS( \
281
+ FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \
282
+ FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
283
+ FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \
284
+ \
285
+ /* \
286
+ * Peel off first iteration (i.e., column of A) for \
287
+ * initializing Caux \
288
+ */ \
289
+ for (BLASLONG i = 0 ; i < ROWS ; i ++ ) \
290
+ for (BLASLONG j = 0 ; j < COLS ; j ++ ) Caux [i ][j ] = A [i ] * B [j ]; \
291
+ \
292
+ for (BLASLONG kk = 1 ; kk < k ; kk ++ ) \
293
+ for (BLASLONG i = 0 ; i < ROWS ; i ++ ) \
294
+ for (BLASLONG j = 0 ; j < COLS ; j ++ ) \
295
+ Caux [i ][j ] += A [i + kk * ROWS ] * B [j + kk * COLS ]; \
296
+ \
297
+ for (BLASLONG i = 0 ; i < ROWS ; i ++ ) \
298
+ for (BLASLONG j = 0 ; j < COLS ; j ++ ) \
299
+ if (trmm ) { \
300
+ C [i + j * ldc ] = alpha * Caux [i ][j ]; \
301
+ } else { \
302
+ C [i + j * ldc ] += alpha * Caux [i ][j ]; \
303
+ } \
304
+ }
305
+
267
306
#ifdef DOUBLE
268
307
VECTOR_BLOCK (2 , 4 )
269
308
VECTOR_BLOCK (2 , 2 )
309
+ VECTOR_BLOCK (2 , 1 )
310
+ #else
311
+ SCALAR_BLOCK (2 , 4 )
312
+ SCALAR_BLOCK (2 , 2 )
313
+ SCALAR_BLOCK (2 , 1 )
314
+ #endif
315
+
316
+ SCALAR_BLOCK (1 , 4 )
317
+ SCALAR_BLOCK (1 , 2 )
318
+ SCALAR_BLOCK (1 , 1 )
319
+
320
+
321
+ /**
322
+ * Calculate a row-block that fits 4x4 vector registers using a loop
323
+ * unrolled-by-2 with explicit interleaving to better overlap loads and
324
+ * computation.
325
+ * This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM.
326
+ */
327
+ #ifdef DOUBLE
328
+ static inline void GEBP_block_8_4 (
329
+ #else // float
330
+ static inline void GEBP_block_16_4 (
331
+ #endif
332
+ FLOAT const * restrict A , BLASLONG bk , FLOAT const * restrict B ,
333
+ FLOAT * restrict C , BLASLONG ldc , FLOAT alpha ) {
334
+ #define VEC_ROWS 4
335
+ #define VEC_COLS 4
336
+ #define ROWS VEC_ROWS * VLEN_FLOATS
337
+ #define COLS (VEC_COLS)
338
+
339
+ /*
340
+ * Hold intermediate results in vector registers.
341
+ * Since we need to force the compiler's hand in places, we need to use
342
+ * individual variables in contrast to the generic implementation's
343
+ * arrays.
344
+ */
345
+ #define INIT_ROW_OF_C (ROW ) \
346
+ vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \
347
+ vector_float C_##ROW##_0 = A##ROW * B[0]; \
348
+ vector_float C_##ROW##_1 = A##ROW * B[1]; \
349
+ vector_float C_##ROW##_2 = A##ROW * B[2]; \
350
+ vector_float C_##ROW##_3 = A##ROW * B[3];
351
+
352
+ INIT_ROW_OF_C (0 )
353
+ INIT_ROW_OF_C (1 )
354
+ INIT_ROW_OF_C (2 )
355
+ INIT_ROW_OF_C (3 )
356
+ #undef INIT_ROW_OF_C
357
+
358
+ if (bk > 1 ) {
359
+ BLASLONG k = 1 ;
360
+ vector_float Ak [VEC_ROWS ], Aknext [VEC_ROWS ];
361
+ vector_float Bk [VEC_COLS ], Bknext [VEC_COLS ];
362
+
363
+ /*
364
+ * Note that in several places, we enforce an instruction
365
+ * sequence that we identified empirically by utilizing dummy
366
+ * asm statements.
367
+ */
368
+
369
+ for (BLASLONG j = 0 ; j < VEC_COLS ; j ++ )
370
+ Bk [j ] = vec_splats (B [j + k * COLS ]);
371
+ asm("" );
372
+
373
+ for (BLASLONG i = 0 ; i < VEC_ROWS ; i ++ )
374
+ Ak [i ] = vec_load_hinted (A + i * VLEN_FLOATS + k * ROWS );
375
+
376
+ for (; k < (bk - 2 ); k += 2 ) {
377
+ /*
378
+ * Load inputs for (k+1) into registers.
379
+ * Loading from B first is advantageous.
380
+ */
381
+ for (BLASLONG j = 0 ; j < VEC_COLS ; j ++ )
382
+ Bknext [j ] = vec_splats (B [j + (k + 1 ) * COLS ]);
383
+ asm("" );
384
+ for (BLASLONG i = 0 ; i < VEC_ROWS ; i ++ )
385
+ Aknext [i ] = vec_load_hinted (A + i * VLEN_FLOATS +
386
+ (k + 1 ) * ROWS );
387
+
388
+ /*
389
+ * To achieve better instruction-level parallelism,
390
+ * make sure to first load input data for (k+1) before
391
+ * initiating compute for k. We enforce that ordering
392
+ * with a pseudo asm statement.
393
+ * Note that we need to massage this particular "barrier"
394
+ * depending on the gcc version.
395
+ */
396
+ #if __GNUC__ > 7
397
+ #define BARRIER_READ_BEFORE_COMPUTE (SUFFIX ) \
398
+ do { \
399
+ asm("" \
400
+ : "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \
401
+ "+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \
402
+ : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
403
+ "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
404
+ "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
405
+ asm("" \
406
+ : "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \
407
+ "+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \
408
+ : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
409
+ "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
410
+ "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
411
+ } while (0)
412
+ #else // __GNUC__ <= 7
413
+ #define BARRIER_READ_BEFORE_COMPUTE (SUFFIX ) \
414
+ do { \
415
+ asm(""); \
416
+ } while (0)
270
417
#endif
271
418
419
+ BARRIER_READ_BEFORE_COMPUTE (knext );
420
+
421
+ /* Compute for (k) */
422
+ C_0_0 += Ak [0 ] * Bk [0 ];
423
+ C_1_0 += Ak [1 ] * Bk [0 ];
424
+ C_2_0 += Ak [2 ] * Bk [0 ];
425
+ C_3_0 += Ak [3 ] * Bk [0 ];
426
+
427
+ C_0_1 += Ak [0 ] * Bk [1 ];
428
+ C_1_1 += Ak [1 ] * Bk [1 ];
429
+ C_2_1 += Ak [2 ] * Bk [1 ];
430
+ C_3_1 += Ak [3 ] * Bk [1 ];
431
+
432
+ C_0_2 += Ak [0 ] * Bk [2 ];
433
+ C_1_2 += Ak [1 ] * Bk [2 ];
434
+ C_2_2 += Ak [2 ] * Bk [2 ];
435
+ C_3_2 += Ak [3 ] * Bk [2 ];
436
+
437
+ C_0_3 += Ak [0 ] * Bk [3 ];
438
+ C_1_3 += Ak [1 ] * Bk [3 ];
439
+ C_2_3 += Ak [2 ] * Bk [3 ];
440
+ C_3_3 += Ak [3 ] * Bk [3 ];
441
+
442
+ asm("" );
443
+
444
+ /*
445
+ * Load inputs for (k+2) into registers.
446
+ * First load from B.
447
+ */
448
+ for (BLASLONG j = 0 ; j < VEC_COLS ; j ++ )
449
+ Bk [j ] = vec_splats (B [j + (k + 2 ) * COLS ]);
450
+ asm("" );
451
+ for (BLASLONG i = 0 ; i < VEC_ROWS ; i ++ )
452
+ Ak [i ] = vec_load_hinted (A + i * VLEN_FLOATS + (k + 2 ) * ROWS );
453
+
454
+ /*
455
+ * As above, make sure to first schedule the loads for (k+2)
456
+ * before compute for (k+1).
457
+ */
458
+ BARRIER_READ_BEFORE_COMPUTE (k );
459
+
460
+ /* Compute on (k+1) */
461
+ C_0_0 += Aknext [0 ] * Bknext [0 ];
462
+ C_1_0 += Aknext [1 ] * Bknext [0 ];
463
+ C_2_0 += Aknext [2 ] * Bknext [0 ];
464
+ C_3_0 += Aknext [3 ] * Bknext [0 ];
465
+
466
+ C_0_1 += Aknext [0 ] * Bknext [1 ];
467
+ C_1_1 += Aknext [1 ] * Bknext [1 ];
468
+ C_2_1 += Aknext [2 ] * Bknext [1 ];
469
+ C_3_1 += Aknext [3 ] * Bknext [1 ];
470
+
471
+ C_0_2 += Aknext [0 ] * Bknext [2 ];
472
+ C_1_2 += Aknext [1 ] * Bknext [2 ];
473
+ C_2_2 += Aknext [2 ] * Bknext [2 ];
474
+ C_3_2 += Aknext [3 ] * Bknext [2 ];
475
+
476
+ C_0_3 += Aknext [0 ] * Bknext [3 ];
477
+ C_1_3 += Aknext [1 ] * Bknext [3 ];
478
+ C_2_3 += Aknext [2 ] * Bknext [3 ];
479
+ C_3_3 += Aknext [3 ] * Bknext [3 ];
480
+ }
481
+
482
+ /* Wrapup remaining k's */
483
+ for (; k < bk ; k ++ ) {
484
+ vector_float Ak ;
485
+
486
+ #define COMPUTE_WRAPUP_ROW (ROW ) \
487
+ Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \
488
+ C_##ROW##_0 += Ak * B[0 + k * COLS]; \
489
+ C_##ROW##_1 += Ak * B[1 + k * COLS]; \
490
+ C_##ROW##_2 += Ak * B[2 + k * COLS]; \
491
+ C_##ROW##_3 += Ak * B[3 + k * COLS];
492
+
493
+ COMPUTE_WRAPUP_ROW (0 )
494
+ COMPUTE_WRAPUP_ROW (1 )
495
+ COMPUTE_WRAPUP_ROW (2 )
496
+ COMPUTE_WRAPUP_ROW (3 )
497
+ #undef COMPUTE_WRAPUP_ROW
498
+ }
499
+ }
500
+
501
+ /*
502
+ * Unpack row-block of C_aux into outer C_i, multiply by
503
+ * alpha and add up (or assign for TRMM).
504
+ */
505
+ #define WRITE_BACK_C (ROW , COL ) \
506
+ do { \
507
+ vector_float *Cij = \
508
+ (vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \
509
+ if (trmm) { \
510
+ *Cij = alpha * C_##ROW##_##COL; \
511
+ } else { \
512
+ *Cij += alpha * C_##ROW##_##COL; \
513
+ } \
514
+ } while (0)
515
+
516
+ WRITE_BACK_C (0 , 0 ); WRITE_BACK_C (0 , 1 ); WRITE_BACK_C (0 , 2 ); WRITE_BACK_C (0 , 3 );
517
+ WRITE_BACK_C (1 , 0 ); WRITE_BACK_C (1 , 1 ); WRITE_BACK_C (1 , 2 ); WRITE_BACK_C (1 , 3 );
518
+ WRITE_BACK_C (2 , 0 ); WRITE_BACK_C (2 , 1 ); WRITE_BACK_C (2 , 2 ); WRITE_BACK_C (2 , 3 );
519
+ WRITE_BACK_C (3 , 0 ); WRITE_BACK_C (3 , 1 ); WRITE_BACK_C (3 , 2 ); WRITE_BACK_C (3 , 3 );
520
+ #undef WRITE_BACK_C
521
+
522
+ #undef ROWS
523
+ #undef VEC_ROWS
524
+ #undef COLS
525
+ #undef VEC_COLS
526
+ #undef BARRIER_READ_BEFORE_COMPUTE
527
+ }
528
+
272
529
/**
273
530
* Handle calculation for row blocks in C_i of any size by dispatching into
274
531
* macro-defined (inline) functions or by deferring to a simple generic
@@ -315,6 +572,8 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
315
572
}
316
573
}
317
574
575
+ /* Dispatch into the implementation for each block size: */
576
+
318
577
#define BLOCK (bm , bn ) \
319
578
if (m == bm && n == bn) { \
320
579
GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
@@ -330,35 +589,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
330
589
BLOCK (8 , 4 ); BLOCK (8 , 2 ); BLOCK (8 , 1 );
331
590
BLOCK (4 , 4 ); BLOCK (4 , 2 ); BLOCK (4 , 1 );
332
591
333
- #ifdef DOUBLE
334
- BLOCK (2 , 4 );
335
- BLOCK (2 , 2 );
336
- #endif
337
-
338
- #undef BLOCK
592
+ BLOCK (2 , 4 ); BLOCK (2 , 2 ); BLOCK (2 , 1 );
339
593
340
- /* simple implementation for smaller block sizes: */
341
- FLOAT Caux [m ][n ] __attribute__ ((aligned (16 )));
594
+ BLOCK (1 , 4 ); BLOCK (1 , 2 ); BLOCK (1 , 1 );
342
595
343
- /*
344
- * Peel off first iteration (i.e., column of A) for initializing Caux
345
- */
346
- for (BLASLONG i = 0 ; i < m ; i ++ )
347
- for (BLASLONG j = 0 ; j < n ; j ++ )
348
- Caux [i ][j ] = A [i ] * B [j ];
349
-
350
- for (BLASLONG kk = 1 ; kk < k ; kk ++ )
351
- for (BLASLONG i = 0 ; i < m ; i ++ )
352
- for (BLASLONG j = 0 ; j < n ; j ++ )
353
- Caux [i ][j ] += A [i + kk * m ] * B [j + kk * n ];
354
-
355
- for (BLASLONG i = 0 ; i < m ; i ++ )
356
- for (BLASLONG j = 0 ; j < n ; j ++ )
357
- if (trmm ) {
358
- C [i + j * ldc ] = alpha * Caux [i ][j ];
359
- } else {
360
- C [i + j * ldc ] += alpha * Caux [i ][j ];
361
- }
596
+ #undef BLOCK
362
597
}
363
598
364
599
/**
0 commit comments