Skip to content

Commit 20245de

Browse files
authored
Merge pull request #2615 from mhillenibm/z14_alignment_hints
s390x: improvise vector alignment hints for older compilers
2 parents ea78106 + 2840432 commit 20245de

File tree

1 file changed

+28
-3
lines changed

1 file changed

+28
-3
lines changed

kernel/zarch/gemm_vec.c

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,32 @@ static const bool backwards = false;
158158

159159
typedef FLOAT vector_float __attribute__ ((vector_size (16)));
160160

161+
/**
162+
* Load a vector into register, and hint on 8-byte alignment to improve
163+
* performance. gcc-9 and newer will create these hints by itself. For older
164+
* compiler versions, use inline assembly to explicitly express the hint.
165+
* Provide explicit hex encoding to cater for binutils versions that do not know
166+
* about vector-load with alignment hints yet.
167+
*
168+
* Note that, for block sizes where we apply vectorization, vectors in A will
169+
* always be 8-byte aligned.
170+
*/
171+
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
172+
vector_float const *restrict addr = (vector_float const *restrict)a;
173+
vector_float y;
174+
175+
#if __GNUC__ < 9
176+
// hex-encode vl %[out],%[addr],3
177+
asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
178+
: [ out ] "=v"(y)
179+
: [ addr ] "R"(*addr));
180+
#else
181+
y = *addr;
182+
#endif
183+
184+
return y;
185+
}
186+
161187
/**
162188
* Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
163189
*
@@ -192,9 +218,8 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
192218
*/ \
193219
for (BLASLONG k = 0; k < bk; k++) { \
194220
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
195-
vector_float Ak = \
196-
*(vector_float *)(A + i * VLEN_FLOATS + \
197-
k * ROWS); \
221+
vector_float Ak = vec_load_hinted( \
222+
A + i * VLEN_FLOATS + k * ROWS); \
198223
\
199224
for (BLASLONG j = 0; j < COLS; j++) \
200225
Caux[i][j] += Ak * B[j + k * COLS]; \

0 commit comments

Comments
 (0)