Skip to content

Commit 9f0ef9c

Browse files
authored
Merge pull request #77 from xianyi/develop
rebase
2 parents 6bfc666 + efdd237 commit 9f0ef9c

File tree

6 files changed

+286
-33
lines changed

6 files changed

+286
-33
lines changed

.travis.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,23 @@ matrix:
7575
- TARGET_BOX=LINUX32
7676
- BTYPE="BINARY=32"
7777

78+
- os: linux
79+
arch: ppc64le
80+
dist: bionic
81+
compiler: gcc
82+
before_script:
83+
- sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y
84+
- sudo apt-get update
85+
- sudo apt-get install gcc-9 gfortran-9 -y
86+
script:
87+
- make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
88+
- make -C test $COMMON_FLAGS $BTYPE
89+
- make -C ctest $COMMON_FLAGS $BTYPE
90+
- make -C utest $COMMON_FLAGS $BTYPE
91+
env:
92+
# for matrix annotation only
93+
- TARGET_BOX=PPC64LE_LINUX_P9
94+
7895
- os: linux
7996
compiler: gcc
8097
addons:

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ ifndef NO_FBLAS
141141
$(MAKE) -C test all
142142
endif
143143
$(MAKE) -C utest all
144-
ifndef NO_CBLAS
144+
ifneq ($(NO_CBLAS), 1)
145145
$(MAKE) -C ctest all
146146
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
147147
$(MAKE) -C cpp_thread_test all
@@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
244244
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
245245
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
246246
endif
247-
ifndef NO_LAPACKE
247+
ifneq ($(NO_LAPACKE), 1)
248248
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
249249
endif
250250
endif

common_power.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){
105105
" bne- 1f\n"
106106
" stwcx. %2,0, %1\n"
107107
" bne- 0b\n"
108+
" isync\n"
108109
"1: "
109110
: "=&r"(ret)
110111
: "r"(address), "r" (val)

interface/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX)
367367
endif
368368

369369

370-
ifndef NO_CBLAS
370+
ifneq ($(NO_CBLAS), 1)
371371

372372
override CFLAGS += -I.
373373

kernel/zarch/gemm_vec.c

Lines changed: 263 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -249,26 +249,283 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
249249

250250

251251
#if UNROLL_M == 16
252-
VECTOR_BLOCK(16, 4)
253252
VECTOR_BLOCK(16, 2)
254253
VECTOR_BLOCK(16, 1)
255254
#endif
256255
#if UNROLL_N == 8
257256
VECTOR_BLOCK(8, 8)
258257
VECTOR_BLOCK(4, 8)
259258
#endif
259+
#ifndef DOUBLE
260260
VECTOR_BLOCK(8, 4)
261+
#endif
261262
VECTOR_BLOCK(8, 2)
262263
VECTOR_BLOCK(8, 1)
263264
VECTOR_BLOCK(4, 4)
264265
VECTOR_BLOCK(4, 2)
265266
VECTOR_BLOCK(4, 1)
266267

268+
/**
269+
* Calculate for a row-block in C_i of size ROWSxCOLS using scalar operations.
270+
* Simple implementation for smaller block sizes
271+
*
272+
* @param[in] A Pointer current block of input matrix A.
273+
* @param[in] k Number of columns in A.
274+
* @param[in] B Pointer current block of input matrix B.
275+
* @param[inout] C Pointer current block of output matrix C.
276+
* @param[in] ldc Offset between elements in adjacent columns in C.
277+
* @param[in] alpha Scalar factor.
278+
*/
279+
#define SCALAR_BLOCK(ROWS, COLS) \
280+
static inline void GEBP_block_##ROWS##_##COLS( \
281+
FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \
282+
FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
283+
FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \
284+
\
285+
/* \
286+
* Peel off first iteration (i.e., column of A) for \
287+
* initializing Caux \
288+
*/ \
289+
for (BLASLONG i = 0; i < ROWS; i++) \
290+
for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \
291+
\
292+
for (BLASLONG kk = 1; kk < k; kk++) \
293+
for (BLASLONG i = 0; i < ROWS; i++) \
294+
for (BLASLONG j = 0; j < COLS; j++) \
295+
Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS]; \
296+
\
297+
for (BLASLONG i = 0; i < ROWS; i++) \
298+
for (BLASLONG j = 0; j < COLS; j++) \
299+
if (trmm) { \
300+
C[i + j * ldc] = alpha * Caux[i][j]; \
301+
} else { \
302+
C[i + j * ldc] += alpha * Caux[i][j]; \
303+
} \
304+
}
305+
267306
#ifdef DOUBLE
268307
VECTOR_BLOCK(2, 4)
269308
VECTOR_BLOCK(2, 2)
309+
VECTOR_BLOCK(2, 1)
310+
#else
311+
SCALAR_BLOCK(2, 4)
312+
SCALAR_BLOCK(2, 2)
313+
SCALAR_BLOCK(2, 1)
314+
#endif
315+
316+
SCALAR_BLOCK(1, 4)
317+
SCALAR_BLOCK(1, 2)
318+
SCALAR_BLOCK(1, 1)
319+
320+
321+
/**
322+
* Calculate a row-block that fits 4x4 vector registers using a loop
323+
* unrolled-by-2 with explicit interleaving to better overlap loads and
324+
* computation.
325+
* This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM.
326+
*/
327+
#ifdef DOUBLE
328+
static inline void GEBP_block_8_4(
329+
#else // float
330+
static inline void GEBP_block_16_4(
331+
#endif
332+
FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B,
333+
FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {
334+
#define VEC_ROWS 4
335+
#define VEC_COLS 4
336+
#define ROWS VEC_ROWS * VLEN_FLOATS
337+
#define COLS (VEC_COLS)
338+
339+
/*
340+
* Hold intermediate results in vector registers.
341+
* Since we need to force the compiler's hand in places, we need to use
342+
* individual variables in contrast to the generic implementation's
343+
* arrays.
344+
*/
345+
#define INIT_ROW_OF_C(ROW) \
346+
vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \
347+
vector_float C_##ROW##_0 = A##ROW * B[0]; \
348+
vector_float C_##ROW##_1 = A##ROW * B[1]; \
349+
vector_float C_##ROW##_2 = A##ROW * B[2]; \
350+
vector_float C_##ROW##_3 = A##ROW * B[3];
351+
352+
INIT_ROW_OF_C(0)
353+
INIT_ROW_OF_C(1)
354+
INIT_ROW_OF_C(2)
355+
INIT_ROW_OF_C(3)
356+
#undef INIT_ROW_OF_C
357+
358+
if (bk > 1) {
359+
BLASLONG k = 1;
360+
vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS];
361+
vector_float Bk[VEC_COLS], Bknext[VEC_COLS];
362+
363+
/*
364+
* Note that in several places, we enforce an instruction
365+
* sequence that we identified empirically by utilizing dummy
366+
* asm statements.
367+
*/
368+
369+
for (BLASLONG j = 0; j < VEC_COLS; j++)
370+
Bk[j] = vec_splats(B[j + k * COLS]);
371+
asm("");
372+
373+
for (BLASLONG i = 0; i < VEC_ROWS; i++)
374+
Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS);
375+
376+
for (; k < (bk - 2); k += 2) {
377+
/*
378+
* Load inputs for (k+1) into registers.
379+
* Loading from B first is advantageous.
380+
*/
381+
for (BLASLONG j = 0; j < VEC_COLS; j++)
382+
Bknext[j] = vec_splats(B[j + (k + 1) * COLS]);
383+
asm("");
384+
for (BLASLONG i = 0; i < VEC_ROWS; i++)
385+
Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS +
386+
(k + 1) * ROWS);
387+
388+
/*
389+
* To achieve better instruction-level parallelism,
390+
* make sure to first load input data for (k+1) before
391+
* initiating compute for k. We enforce that ordering
392+
* with a pseudo asm statement.
393+
* Note that we need to massage this particular "barrier"
394+
* depending on the gcc version.
395+
*/
396+
#if __GNUC__ > 7
397+
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
398+
do { \
399+
asm("" \
400+
: "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \
401+
"+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \
402+
: "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
403+
"v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
404+
"v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
405+
asm("" \
406+
: "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \
407+
"+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \
408+
: "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
409+
"v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
410+
"v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
411+
} while (0)
412+
#else // __GNUC__ <= 7
413+
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
414+
do { \
415+
asm(""); \
416+
} while (0)
270417
#endif
271418

419+
BARRIER_READ_BEFORE_COMPUTE(knext);
420+
421+
/* Compute for (k) */
422+
C_0_0 += Ak[0] * Bk[0];
423+
C_1_0 += Ak[1] * Bk[0];
424+
C_2_0 += Ak[2] * Bk[0];
425+
C_3_0 += Ak[3] * Bk[0];
426+
427+
C_0_1 += Ak[0] * Bk[1];
428+
C_1_1 += Ak[1] * Bk[1];
429+
C_2_1 += Ak[2] * Bk[1];
430+
C_3_1 += Ak[3] * Bk[1];
431+
432+
C_0_2 += Ak[0] * Bk[2];
433+
C_1_2 += Ak[1] * Bk[2];
434+
C_2_2 += Ak[2] * Bk[2];
435+
C_3_2 += Ak[3] * Bk[2];
436+
437+
C_0_3 += Ak[0] * Bk[3];
438+
C_1_3 += Ak[1] * Bk[3];
439+
C_2_3 += Ak[2] * Bk[3];
440+
C_3_3 += Ak[3] * Bk[3];
441+
442+
asm("");
443+
444+
/*
445+
* Load inputs for (k+2) into registers.
446+
* First load from B.
447+
*/
448+
for (BLASLONG j = 0; j < VEC_COLS; j++)
449+
Bk[j] = vec_splats(B[j + (k + 2) * COLS]);
450+
asm("");
451+
for (BLASLONG i = 0; i < VEC_ROWS; i++)
452+
Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS);
453+
454+
/*
455+
* As above, make sure to first schedule the loads for (k+2)
456+
* before compute for (k+1).
457+
*/
458+
BARRIER_READ_BEFORE_COMPUTE(k);
459+
460+
/* Compute on (k+1) */
461+
C_0_0 += Aknext[0] * Bknext[0];
462+
C_1_0 += Aknext[1] * Bknext[0];
463+
C_2_0 += Aknext[2] * Bknext[0];
464+
C_3_0 += Aknext[3] * Bknext[0];
465+
466+
C_0_1 += Aknext[0] * Bknext[1];
467+
C_1_1 += Aknext[1] * Bknext[1];
468+
C_2_1 += Aknext[2] * Bknext[1];
469+
C_3_1 += Aknext[3] * Bknext[1];
470+
471+
C_0_2 += Aknext[0] * Bknext[2];
472+
C_1_2 += Aknext[1] * Bknext[2];
473+
C_2_2 += Aknext[2] * Bknext[2];
474+
C_3_2 += Aknext[3] * Bknext[2];
475+
476+
C_0_3 += Aknext[0] * Bknext[3];
477+
C_1_3 += Aknext[1] * Bknext[3];
478+
C_2_3 += Aknext[2] * Bknext[3];
479+
C_3_3 += Aknext[3] * Bknext[3];
480+
}
481+
482+
/* Wrapup remaining k's */
483+
for (; k < bk; k++) {
484+
vector_float Ak;
485+
486+
#define COMPUTE_WRAPUP_ROW(ROW) \
487+
Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \
488+
C_##ROW##_0 += Ak * B[0 + k * COLS]; \
489+
C_##ROW##_1 += Ak * B[1 + k * COLS]; \
490+
C_##ROW##_2 += Ak * B[2 + k * COLS]; \
491+
C_##ROW##_3 += Ak * B[3 + k * COLS];
492+
493+
COMPUTE_WRAPUP_ROW(0)
494+
COMPUTE_WRAPUP_ROW(1)
495+
COMPUTE_WRAPUP_ROW(2)
496+
COMPUTE_WRAPUP_ROW(3)
497+
#undef COMPUTE_WRAPUP_ROW
498+
}
499+
}
500+
501+
/*
502+
* Unpack row-block of C_aux into outer C_i, multiply by
503+
* alpha and add up (or assign for TRMM).
504+
*/
505+
#define WRITE_BACK_C(ROW, COL) \
506+
do { \
507+
vector_float *Cij = \
508+
(vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \
509+
if (trmm) { \
510+
*Cij = alpha * C_##ROW##_##COL; \
511+
} else { \
512+
*Cij += alpha * C_##ROW##_##COL; \
513+
} \
514+
} while (0)
515+
516+
WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3);
517+
WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3);
518+
WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3);
519+
WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3);
520+
#undef WRITE_BACK_C
521+
522+
#undef ROWS
523+
#undef VEC_ROWS
524+
#undef COLS
525+
#undef VEC_COLS
526+
#undef BARRIER_READ_BEFORE_COMPUTE
527+
}
528+
272529
/**
273530
* Handle calculation for row blocks in C_i of any size by dispatching into
274531
* macro-defined (inline) functions or by deferring to a simple generic
@@ -315,6 +572,8 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
315572
}
316573
}
317574

575+
/* Dispatch into the implementation for each block size: */
576+
318577
#define BLOCK(bm, bn) \
319578
if (m == bm && n == bn) { \
320579
GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
@@ -330,35 +589,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
330589
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
331590
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
332591

333-
#ifdef DOUBLE
334-
BLOCK(2, 4);
335-
BLOCK(2, 2);
336-
#endif
337-
338-
#undef BLOCK
592+
BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1);
339593

340-
/* simple implementation for smaller block sizes: */
341-
FLOAT Caux[m][n] __attribute__ ((aligned (16)));
594+
BLOCK(1, 4); BLOCK(1, 2); BLOCK(1, 1);
342595

343-
/*
344-
* Peel off first iteration (i.e., column of A) for initializing Caux
345-
*/
346-
for (BLASLONG i = 0; i < m; i++)
347-
for (BLASLONG j = 0; j < n; j++)
348-
Caux[i][j] = A[i] * B[j];
349-
350-
for (BLASLONG kk = 1; kk < k; kk++)
351-
for (BLASLONG i = 0; i < m; i++)
352-
for (BLASLONG j = 0; j < n; j++)
353-
Caux[i][j] += A[i + kk * m] * B[j + kk * n];
354-
355-
for (BLASLONG i = 0; i < m; i++)
356-
for (BLASLONG j = 0; j < n; j++)
357-
if (trmm) {
358-
C[i + j * ldc] = alpha * Caux[i][j];
359-
} else {
360-
C[i + j * ldc] += alpha * Caux[i][j];
361-
}
596+
#undef BLOCK
362597
}
363598

364599
/**

0 commit comments

Comments
 (0)