51
51
static const size_t unroll_m = UNROLL_M ;
52
52
static const size_t unroll_n = UNROLL_N ;
53
53
54
+ /* Handling of triangular matrices */
55
+ #ifdef TRMMKERNEL
56
+ static const bool trmm = true;
57
+ static const bool left =
58
+ #ifdef LEFT
59
+ true;
60
+ #else
61
+ false;
62
+ #endif
63
+
64
+ static const bool backwards =
65
+ #if defined(LEFT ) != defined(TRANSA )
66
+ true;
67
+ #else
68
+ false;
69
+ #endif
70
+
71
+ #else
72
+ static const bool trmm = false;
73
+ static const bool left = false;
74
+ static const bool backwards = false;
75
+ #endif /* TRMMKERNEL */
76
+
54
77
/*
55
78
* Background:
56
79
*
@@ -111,6 +134,17 @@ static const size_t unroll_n = UNROLL_N;
111
134
* vectorization for varying block sizes)
112
135
* - add alpha * row block of C_aux back into C_j.
113
136
*
137
+ * Note that there are additional mechanics for handling triangular matrices,
138
+ * calculating B := alpha (A * B) where either of the matrices A or B can be
139
+ * triangular. In case of A, the macro "LEFT" is defined. In addition, A can
140
+ * optionally be transposed.
141
+ * The code effectively skips an "offset" number of columns in A and rows of B
142
+ * in each block, to save unnecessary work by exploiting the triangular nature.
143
+ * To handle all cases, the code discerns (1) a "left" mode when A is triangular
144
+ * and (2) "forward" / "backwards" modes where only the first "offset"
145
+ * columns/rows of A/B are used or where the first "offset" columns/rows are
146
+ * skipped, respectively.
147
+ *
114
148
* Reference:
115
149
*
116
150
* The summary above is based on staring at various kernel implementations and:
@@ -176,7 +210,11 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
176
210
vector_float * C_ij = \
177
211
(vector_float * )(C + i * VLEN_FLOATS + \
178
212
j * ldc ); \
179
- * C_ij += alpha * Caux [i ][j ]; \
213
+ if (trmm ) { \
214
+ * C_ij = alpha * Caux [i ][j ]; \
215
+ } else { \
216
+ * C_ij += alpha * Caux [i ][j ]; \
217
+ } \
180
218
} \
181
219
} \
182
220
}
@@ -209,17 +247,37 @@ VECTOR_BLOCK(2, 2)
209
247
* @param[inout] C Pointer to current column block (panel) of output matrix C.
210
248
* @param[in] ldc Offset between elements in adjacent columns in C.
211
249
* @param[in] alpha Scalar factor.
250
+ * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
251
+ * @param[in] off Running offset for handling triangular matrices.
212
252
*/
213
253
static inline void GEBP_block (BLASLONG m , BLASLONG n ,
214
254
BLASLONG first_row ,
215
255
const FLOAT * restrict A , BLASLONG k ,
216
256
const FLOAT * restrict B ,
217
257
FLOAT * restrict C , BLASLONG ldc ,
218
- FLOAT alpha )
258
+ FLOAT alpha ,
259
+ BLASLONG offset , BLASLONG off )
219
260
{
261
+ if (trmm && left )
262
+ off = offset + first_row ;
263
+
220
264
A += first_row * k ;
221
265
C += first_row ;
222
266
267
+ if (trmm ) {
268
+ if (backwards ) {
269
+ A += off * m ;
270
+ B += off * n ;
271
+ k -= off ;
272
+ } else {
273
+ if (left ) {
274
+ k = off + m ;
275
+ } else {
276
+ k = off + n ;
277
+ }
278
+ }
279
+ }
280
+
223
281
#define BLOCK (bm , bn ) \
224
282
if (m == bm && n == bn) { \
225
283
GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
@@ -253,7 +311,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
253
311
254
312
for (BLASLONG i = 0 ; i < m ; i ++ )
255
313
for (BLASLONG j = 0 ; j < n ; j ++ )
256
- C [i + j * ldc ] += alpha * Caux [i ][j ];
314
+ if (trmm ) {
315
+ C [i + j * ldc ] = alpha * Caux [i ][j ];
316
+ } else {
317
+ C [i + j * ldc ] += alpha * Caux [i ][j ];
318
+ }
257
319
}
258
320
259
321
/**
@@ -268,12 +330,15 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
268
330
* @param[inout] C Pointer to output matrix C (note: all of it).
269
331
* @param[in] ldc Offset between elements in adjacent columns in C.
270
332
* @param[in] alpha Scalar factor.
333
+ * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
271
334
*/
272
335
static inline void GEBP_column_block (BLASLONG num_cols , BLASLONG first_col ,
273
336
const FLOAT * restrict A , BLASLONG bk ,
274
337
const FLOAT * restrict B , BLASLONG bm ,
275
338
FLOAT * restrict C , BLASLONG ldc ,
276
- FLOAT alpha ) {
339
+ FLOAT alpha ,
340
+ BLASLONG const offset ) {
341
+
277
342
FLOAT * restrict C_i = C + first_col * ldc ;
278
343
/*
279
344
* B is in column-order with n_r packed row elements, which does
@@ -282,6 +347,15 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
282
347
*/
283
348
const FLOAT * restrict B_i = B + first_col * bk ;
284
349
350
+ BLASLONG off = 0 ;
351
+ if (trmm ) {
352
+ if (left ) {
353
+ off = offset ;
354
+ } else {
355
+ off = - offset + first_col ;
356
+ }
357
+ }
358
+
285
359
/*
286
360
* Calculate C_aux := A * B_j
287
361
* then unpack C_i += alpha * C_aux.
@@ -293,14 +367,17 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
293
367
for (BLASLONG block_size = unroll_m ; block_size > 0 ; block_size /= 2 )
294
368
for (; bm - row >= block_size ; row += block_size )
295
369
GEBP_block (block_size , num_cols , row , A , bk , B_i , C_i ,
296
- ldc , alpha );
370
+ ldc , alpha , offset , off );
297
371
}
298
372
299
373
/**
300
374
* Inner kernel for matrix-matrix multiplication. C += alpha (A * B)
301
375
* where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and
302
376
* C are pointers to submatrices of the actual matrices.
303
377
*
378
+ * For triangular matrix multiplication, calculate B := alpha (A * B) where A
379
+ * or B can be triangular (in case of A, the macro LEFT will be defined).
380
+ *
304
381
* @param[in] bm Number of rows in C and A.
305
382
* @param[in] bn Number of columns in C and B.
306
383
* @param[in] bk Number of columns in A and rows in B.
@@ -309,11 +386,16 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
309
386
* @param[in] bb Pointer to input matrix B.
310
387
* @param[inout] C Pointer to output matrix C.
311
388
* @param[in] ldc Offset between elements in adjacent columns in C.
389
+ * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
312
390
* @returns 0 on success.
313
391
*/
314
392
int CNAME (BLASLONG bm , BLASLONG bn , BLASLONG bk , FLOAT alpha ,
315
393
FLOAT * restrict ba , FLOAT * restrict bb ,
316
- FLOAT * restrict C , BLASLONG ldc )
394
+ FLOAT * restrict C , BLASLONG ldc
395
+ #ifdef TRMMKERNEL
396
+ , BLASLONG offset
397
+ #endif
398
+ )
317
399
{
318
400
if ( (bm == 0 ) || (bn == 0 ) || (bk == 0 ) || (alpha == ZERO ))
319
401
return 0 ;
@@ -326,6 +408,14 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
326
408
ba = __builtin_assume_aligned (ba , 16 );
327
409
bb = __builtin_assume_aligned (bb , 16 );
328
410
411
+ /*
412
+ * Use offset and off even when compiled as SGEMMKERNEL to simplify
413
+ * function signatures and function calls.
414
+ */
415
+ #ifndef TRMMKERNEL
416
+ BLASLONG const offset = 0 ;
417
+ #endif
418
+
329
419
/*
330
420
* Partition B and C into blocks of n_r (unroll_n) columns, called B_i
331
421
* and C_i. For each partition, calculate C_i += alpha * (A * B_j).
@@ -336,7 +426,7 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
336
426
BLASLONG col = 0 ;
337
427
for (BLASLONG block_size = unroll_n ; block_size > 0 ; block_size /= 2 )
338
428
for (; bn - col >= block_size ; col += block_size )
339
- GEBP_column_block (block_size , col , ba , bk , bb , bm , C , ldc , alpha );
429
+ GEBP_column_block (block_size , col , ba , bk , bb , bm , C , ldc , alpha , offset );
340
430
341
431
return 0 ;
342
432
}
0 commit comments