@@ -158,6 +158,32 @@ static const bool backwards = false;
158
158
159
159
typedef FLOAT vector_float __attribute__ ((vector_size (16 )));
160
160
161
+ /**
162
+ * Load a vector into register, and hint on 8-byte alignment to improve
163
+ * performance. gcc-9 and newer will create these hints by itself. For older
164
+ * compiler versions, use inline assembly to explicitly express the hint.
165
+ * Provide explicit hex encoding to cater for binutils versions that do not know
166
+ * about vector-load with alignment hints yet.
167
+ *
168
+ * Note that, for block sizes where we apply vectorization, vectors in A will
169
+ * always be 8-byte aligned.
170
+ */
171
+ static inline vector_float vec_load_hinted (FLOAT const * restrict a ) {
172
+ vector_float const * restrict addr = (vector_float const * restrict)a ;
173
+ vector_float y ;
174
+
175
+ #if __GNUC__ < 9
176
+ // hex-encode vl %[out],%[addr],3
177
+ asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
178
+ : [ out ] "=v" (y )
179
+ : [ addr ] "R" (* addr ));
180
+ #else
181
+ y = * addr ;
182
+ #endif
183
+
184
+ return y ;
185
+ }
186
+
161
187
/**
162
188
* Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
163
189
*
@@ -192,9 +218,8 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
192
218
*/ \
193
219
for (BLASLONG k = 0 ; k < bk ; k ++ ) { \
194
220
for (BLASLONG i = 0 ; i < ROWS / VLEN_FLOATS ; i ++ ) { \
195
- vector_float Ak = \
196
- * (vector_float * )(A + i * VLEN_FLOATS + \
197
- k * ROWS ); \
221
+ vector_float Ak = vec_load_hinted ( \
222
+ A + i * VLEN_FLOATS + k * ROWS ); \
198
223
\
199
224
for (BLASLONG j = 0 ; j < COLS ; j ++ ) \
200
225
Caux [i ][j ] += Ak * B [j + k * COLS ]; \
0 commit comments