@@ -91,10 +91,13 @@ gf_4vect_dot_prod_rvv:
91
91
sd s2, 16 (sp)
92
92
sd s3, 24 (sp)
93
93
94
- vsetvli t0, x0, e8, m1 /* Set vector length to maximum */
95
-
94
+ # vsetvli t0, x0, e8, m1 /* Set vector length to maximum */
95
+ vsetvli t0, x0, e8, m1
96
96
li x_pos, 0
97
- slli t_offset, x_vec, 5
97
+
98
+ slli x_vec, x_vec, 3
99
+ slli t_offset, x_vec, 2
100
+
98
101
ld x_dest1, 0 (x_dest)
99
102
ld x_dest2, 8 (x_dest)
100
103
ld x_dest3, 16 (x_dest)
@@ -111,19 +114,20 @@ gf_4vect_dot_prod_rvv:
111
114
vmv.v.i v_dest3, 0
112
115
vmv.v.i v_dest4, 0
113
116
117
+ /* x_vec, number of source vectors (i.e. data blocks) */
118
+ li x_vec_i, 0
119
+
120
+ /* load source pointer */
121
+ ld x_ptr, 0 (x_src)
122
+
114
123
/* Reset table pointers */
115
124
mv x_tbl1, x_tbl
116
125
add x_tbl2, x_tbl1, t_offset
117
126
add x_tbl3, x_tbl2, t_offset
118
127
add x_tbl4, x_tbl3, t_offset
119
128
120
- /* Loop 2: x_vec, number of source vectors (ie. data blocks) */
121
- li x_vec_i, 0
122
129
.Lloop_rvv_vl_vects:
123
130
/* Load source data */
124
- slli a6, x_vec_i, 3
125
- add a6,x_src,a6
126
- ld x_ptr, 0 (a6)
127
131
add x_ptr,x_ptr,x_pos
128
132
129
133
vle8.v v_src, (x_ptr)
@@ -142,6 +146,13 @@ gf_4vect_dot_prod_rvv:
142
146
vle8.v v_gft2_hi, (x_tbl2)
143
147
addi x_tbl2, x_tbl2, 16
144
148
149
+ prefetch .r 0 (x_tbl1)
150
+ prefetch .r 0 (x_tbl2)
151
+
152
+ /* Move to next source vector */
153
+ addi x_vec_i, x_vec_i, 8
154
+ add a6,x_src,x_vec_i
155
+ ld x_ptr, 0 (a6)
145
156
146
157
/* Load next gf_tables */
147
158
vle8.v v_gft3_lo, (x_tbl3)
@@ -153,6 +164,8 @@ gf_4vect_dot_prod_rvv:
153
164
addi x_tbl4, x_tbl4, 16
154
165
vle8.v v_gft4_hi, (x_tbl4)
155
166
addi x_tbl4, x_tbl4, 16
167
+ prefetch .r 0 (x_tbl3)
168
+ prefetch .r 0 (x_tbl4)
156
169
157
170
/* dest 1 */
158
171
vrgather.vv v26, v_gft1_lo, v_src_lo
@@ -178,9 +191,6 @@ gf_4vect_dot_prod_rvv:
178
191
vxor.vv v_dest4, v_dest4, v26
179
192
vxor.vv v_dest4, v_dest4, v27
180
193
181
- /* Move to next source vector */
182
- addi x_vec_i, x_vec_i, 1
183
-
184
194
/* Check if we have processed all vectors */
185
195
blt x_vec_i, x_vec, .Lloop_rvv_vl_vects
186
196
@@ -198,7 +208,7 @@ gf_4vect_dot_prod_rvv:
198
208
j .Lloop_rvv_vl
199
209
200
210
.return_pass:
201
- /* restore callee-saved registers */
211
+ /* restore callee-saved registers */
202
212
ld s0, 0 (sp)
203
213
ld s1, 8 (sp)
204
214
ld s2, 16 (sp)
0 commit comments