Skip to content

Commit 71b6eaf

Browse files
committed
s390x: Use new sgemm kernel also for strmm on Z14 and newer
Employ the newly added GEMM kernel also for STRMM on Z14. The implementation in C with vector intrinsics exploits FP32 SIMD operations and thereby gains performance over the existing assembly code. Extend the implementation for handling triangular matrix multiplication, accordingly. As an added benefit, the more flexible C code enables us to adjust register blocking in the subsequent commit. Tested via make -C test / ctest / utest and by a couple of additional unit tests that exercise blocking. Signed-off-by: Marius Hillenbrand <[email protected]>
1 parent 43c0d4f commit 71b6eaf

File tree

2 files changed

+98
-14
lines changed

2 files changed

+98
-14
lines changed

kernel/zarch/KERNEL.Z14

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ DGEMVTKERNEL = dgemv_t_4.c
8686
CGEMVTKERNEL = cgemv_t_4.c
8787
ZGEMVTKERNEL = zgemv_t_4.c
8888

89-
STRMMKERNEL = strmm8x4V.S
89+
STRMMKERNEL = gemm_vec.c
9090
DTRMMKERNEL = trmm8x4V.S
9191
CTRMMKERNEL = ctrmm4x4V.S
9292
ZTRMMKERNEL = ztrmm4x4V.S
@@ -101,8 +101,6 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
101101
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
102102
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
103103

104-
105-
106104
DGEMMKERNEL = gemm8x4V.S
107105
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
108106
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
@@ -145,7 +143,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
145143
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
146144
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
147145

148-
149-
150-
151-

kernel/zarch/gemm_vec.c

Lines changed: 97 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,29 @@
5151
static const size_t unroll_m = UNROLL_M;
5252
static const size_t unroll_n = UNROLL_N;
5353

54+
/* Handling of triangular matrices */
55+
#ifdef TRMMKERNEL
56+
static const bool trmm = true;
57+
static const bool left =
58+
#ifdef LEFT
59+
true;
60+
#else
61+
false;
62+
#endif
63+
64+
static const bool backwards =
65+
#if defined(LEFT) != defined(TRANSA)
66+
true;
67+
#else
68+
false;
69+
#endif
70+
71+
#else
72+
static const bool trmm = false;
73+
static const bool left = false;
74+
static const bool backwards = false;
75+
#endif /* TRMMKERNEL */
76+
5477
/*
5578
* Background:
5679
*
@@ -111,6 +134,17 @@ static const size_t unroll_n = UNROLL_N;
111134
* vectorization for varying block sizes)
112135
* - add alpha * row block of C_aux back into C_j.
113136
*
137+
* Note that there are additional mechanics for handling triangular matrices,
138+
* calculating B := alpha (A * B) where either of the matrices A or B can be
139+
* triangular. In case of A, the macro "LEFT" is defined. In addition, A can
140+
* optionally be transposed.
141+
* The code effectively skips an "offset" number of columns in A and rows of B
142+
* in each block, to save unnecessary work by exploiting the triangular nature.
143+
* To handle all cases, the code discerns (1) a "left" mode when A is triangular
144+
* and (2) "forward" / "backwards" modes where only the first "offset"
145+
* columns/rows of A/B are used or where the first "offset" columns/rows are
146+
* skipped, respectively.
147+
*
114148
* Reference:
115149
*
116150
* The summary above is based on staring at various kernel implementations and:
@@ -176,7 +210,11 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
176210
vector_float *C_ij = \
177211
(vector_float *)(C + i * VLEN_FLOATS + \
178212
j * ldc); \
179-
*C_ij += alpha * Caux[i][j]; \
213+
if (trmm) { \
214+
*C_ij = alpha * Caux[i][j]; \
215+
} else { \
216+
*C_ij += alpha * Caux[i][j]; \
217+
} \
180218
} \
181219
} \
182220
}
@@ -209,17 +247,37 @@ VECTOR_BLOCK(2, 2)
209247
* @param[inout] C Pointer to current column block (panel) of output matrix C.
210248
* @param[in] ldc Offset between elements in adjacent columns in C.
211249
* @param[in] alpha Scalar factor.
250+
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
251+
* @param[in] off Running offset for handling triangular matrices; recomputed as offset + first_row when A is the triangular matrix (LEFT).
212252
*/
213253
static inline void GEBP_block(BLASLONG m, BLASLONG n,
214254
BLASLONG first_row,
215255
const FLOAT * restrict A, BLASLONG k,
216256
const FLOAT * restrict B,
217257
FLOAT *restrict C, BLASLONG ldc,
218-
FLOAT alpha)
258+
FLOAT alpha,
259+
BLASLONG offset, BLASLONG off)
219260
{
261+
if (trmm && left)
262+
off = offset + first_row;
263+
220264
A += first_row * k;
221265
C += first_row;
222266

267+
if (trmm) {
268+
if (backwards) {
269+
A += off * m;
270+
B += off * n;
271+
k -= off;
272+
} else {
273+
if (left) {
274+
k = off + m;
275+
} else {
276+
k = off + n;
277+
}
278+
}
279+
}
280+
223281
#define BLOCK(bm, bn) \
224282
if (m == bm && n == bn) { \
225283
GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
@@ -253,7 +311,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
253311

254312
for (BLASLONG i = 0; i < m; i++)
255313
for (BLASLONG j = 0; j < n; j++)
256-
C[i + j * ldc] += alpha * Caux[i][j];
314+
if (trmm) {
315+
C[i + j * ldc] = alpha * Caux[i][j];
316+
} else {
317+
C[i + j * ldc] += alpha * Caux[i][j];
318+
}
257319
}
258320

259321
/**
@@ -268,12 +330,15 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
268330
* @param[inout] C Pointer to output matrix C (note: all of it).
269331
* @param[in] ldc Offset between elements in adjacent columns in C.
270332
* @param[in] alpha Scalar factor.
333+
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
271334
*/
272335
static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
273336
const FLOAT *restrict A, BLASLONG bk,
274337
const FLOAT *restrict B, BLASLONG bm,
275338
FLOAT *restrict C, BLASLONG ldc,
276-
FLOAT alpha) {
339+
FLOAT alpha,
340+
BLASLONG const offset) {
341+
277342
FLOAT *restrict C_i = C + first_col * ldc;
278343
/*
279344
* B is in column-order with n_r packed row elements, which does
@@ -282,6 +347,15 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
282347
*/
283348
const FLOAT *restrict B_i = B + first_col * bk;
284349

350+
BLASLONG off = 0;
351+
if (trmm) {
352+
if (left) {
353+
off = offset;
354+
} else {
355+
off = -offset + first_col;
356+
}
357+
}
358+
285359
/*
286360
* Calculate C_aux := A * B_i
287361
* then unpack C_i += alpha * C_aux.
@@ -293,14 +367,17 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
293367
for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2)
294368
for (; bm - row >= block_size; row += block_size)
295369
GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i,
296-
ldc, alpha);
370+
ldc, alpha, offset, off);
297371
}
298372

299373
/**
300374
* Inner kernel for matrix-matrix multiplication. C += alpha (A * B)
301375
* where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and
302376
* C are pointers to submatrices of the actual matrices.
303377
*
378+
* For triangular matrix multiplication, calculate B := alpha (A * B) where A
379+
* or B can be triangular (in case of A, the macro LEFT will be defined).
380+
*
304381
* @param[in] bm Number of rows in C and A.
305382
* @param[in] bn Number of columns in C and B.
306383
* @param[in] bk Number of columns in A and rows in B.
@@ -309,11 +386,16 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
309386
* @param[in] bb Pointer to input matrix B.
310387
* @param[inout] C Pointer to output matrix C.
311388
* @param[in] ldc Offset between elements in adjacent columns in C.
389+
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
312390
* @returns 0 on success.
313391
*/
314392
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
315393
FLOAT *restrict ba, FLOAT *restrict bb,
316-
FLOAT *restrict C, BLASLONG ldc)
394+
FLOAT *restrict C, BLASLONG ldc
395+
#ifdef TRMMKERNEL
396+
, BLASLONG offset
397+
#endif
398+
)
317399
{
318400
if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
319401
return 0;
@@ -326,6 +408,14 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
326408
ba = __builtin_assume_aligned(ba, 16);
327409
bb = __builtin_assume_aligned(bb, 16);
328410

411+
/*
412+
* Use offset and off even when compiled as SGEMMKERNEL to simplify
413+
* function signatures and function calls.
414+
*/
415+
#ifndef TRMMKERNEL
416+
BLASLONG const offset = 0;
417+
#endif
418+
329419
/*
330420
* Partition B and C into blocks of n_r (unroll_n) columns, called B_i
331421
* and C_i. For each partition, calculate C_i += alpha * (A * B_i).
@@ -336,7 +426,7 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
336426
BLASLONG col = 0;
337427
for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
338428
for (; bn - col >= block_size; col += block_size)
339-
GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha);
429+
GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset);
340430

341431
return 0;
342432
}

0 commit comments

Comments
 (0)