Skip to content

Commit 803e8d4

Browse files
committed
Hoist the assignment of the vector x values in gemv_n_sve.c out of the inner loop into the outermost loop to avoid repeated data retrieval.
1. Verified correctness using BLAS-Tester. 2. Verified performance using the built-in benchmark: the performance of the float and double types improved by about 60% and about 40%, respectively. The test commands are: (a) export OMP_NUM_THREADS=1; numactl -C 10 -l ./sgemv.goto 3000 4000 100 (b) export OMP_NUM_THREADS=1; numactl -C 10 -l ./dgemv.goto 3000 4000 100
1 parent 75c6ab4 commit 803e8d4

File tree

1 file changed

+6
-12
lines changed

1 file changed

+6
-12
lines changed

kernel/arm64/gemv_n_sve.c

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
6969
FLOAT *a2_ptr = a + lda * width * 2;
7070

7171
for (j = 0; j < width; j++) {
72-
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
73-
ix = j * inc_x;
74-
75-
SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
76-
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]);
77-
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]);
72+
ix = j * inc_x;
7873

74+
SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
75+
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]);
76+
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]);
77+
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
7978
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i);
8079
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i);
8180
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i);
@@ -89,10 +88,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
8988
}
9089

9190
if (i < m) {
92-
SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
93-
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]);
94-
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]);
95-
9691
SV_TYPE a00_vec = svld1(pg, a0_ptr + i);
9792
SV_TYPE a01_vec = svld1(pg, a1_ptr + i);
9893
SV_TYPE a02_vec = svld1(pg, a2_ptr + i);
@@ -115,17 +110,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
115110
a_ptr = a2_ptr;
116111
for (j = width * 3; j < n; j++) {
117112
ix = j * inc_x;
113+
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
118114
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
119115
SV_TYPE y_vec = svld1(pg_true, y + i);
120-
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
121116
SV_TYPE a_vec = svld1(pg_true, a_ptr + i);
122117
y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec);
123118
svst1(pg_true, y + i, y_vec);
124119
}
125120

126121
if (i < m) {
127122
SV_TYPE y_vec = svld1(pg, y + i);
128-
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
129123
SV_TYPE a_vec = svld1(pg, a_ptr + i);
130124
y_vec = svmla_m(pg, y_vec, a_vec, x_vec);
131125
svst1(pg, y + i, y_vec);

0 commit comments

Comments
 (0)