Skip to content

Commit 41fa113

Browse files
committed
erasure_code: add prefetch and vsetvli optimization
Signed-off-by: Shuo Lv <[email protected]>
1 parent 5e90721 commit 41fa113

15 files changed

+96
-60
lines changed

configure.ac

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ case "${CPU}" in
7171
AM_CONDITIONAL([HAVE_RVV], [false]) rvv=no]
7272
)
7373
if test "x$rvv" = "xyes"; then
74-
CFLAGS+=" -march=rv64gcv"
75-
CCASFLAGS+=" -march=rv64gcv"
74+
CFLAGS+=" -march=rv64gcv_zicbop"
75+
CCASFLAGS+=" -march=rv64gcv_zicbop"
7676
fi
7777
AC_MSG_RESULT([$rvv])
7878
;;

erasure_code/riscv64/gf_2vect_dot_prod_rvv.S

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,10 @@ gf_2vect_dot_prod_rvv:
7474
li t6, 16
7575
blt x_len, t6, .return_fail
7676

77-
vsetvli a5, x0, e8, m1 /* Set vector length to maximum */
78-
77+
vsetvli a5, x0, e8, m1,ta,ma /* Set vector length to maximum */
7978
li x_pos, 0
79+
slli x_vec, x_vec, 3
80+
8081
ld x_dest1, 0(x_dest)
8182
ld x_dest2, 8(x_dest)
8283

@@ -92,15 +93,12 @@ gf_2vect_dot_prod_rvv:
9293

9394
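/* gf_tbl base = x_tbl + dest_idx * (number of source vectors) * 32; x_vec is already scaled by 8 (slli x_vec, x_vec, 3), so the slli below shifts by 2, not 5 */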
9495
mv x_tbl1, x_tbl /* reset x_tbl1 */
95-
slli t6, x_vec, 5
96+
slli t6, x_vec, 2
9697
add x_tbl2, x_tbl1, t6 /* reset x_tbl2 */
9798

9899
/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
99100
.Llooprvv_vl_vects:
100101
/* load src data */
101-
slli a6, x_vec_i, 3
102-
add a6,x_src,a6
103-
ld x_ptr, 0(a6)
104102
add x_ptr,x_ptr,x_pos
105103

106104
vle8.v v_src, (x_ptr) /* load from: src base + pos offset */
@@ -120,6 +118,14 @@ gf_2vect_dot_prod_rvv:
120118
vle8.v v_gft2_hi, (x_tbl2)
121119
addi x_tbl2, x_tbl2, 16
122120

121+
prefetch.r 0(x_tbl1)
122+
prefetch.r 0(x_tbl2)
123+
124+
/* calc for next */
125+
addi x_vec_i, x_vec_i, 8 /* move x_vec_i to next */
126+
add a6,x_src,x_vec_i
127+
ld x_ptr, 0(a6)
128+
123129
/* dest 1 */
124130
/* table indexing, ie. gf(2^8) multiplication */
125131
vrgather.vv v26, v_gft1_lo, v_src_lo
@@ -134,8 +140,6 @@ gf_2vect_dot_prod_rvv:
134140
vxor.vv v_dest2, v_dest2, v26
135141
vxor.vv v_dest2, v_dest2, v27
136142

137-
/* calc for next */
138-
addi x_vec_i, x_vec_i, 1 /* move x_vec_i to next */
139143
blt x_vec_i, x_vec, .Llooprvv_vl_vects
140144
/* end of Loop 2 */
141145

erasure_code/riscv64/gf_2vect_mad_rvv.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ gf_2vect_mad_rvv:
7171
li t3, 16
7272
blt x_len, t3, .return_fail
7373

74-
vsetvli t4, x0, e8, m1
74+
vsetvli t4, x0, e8, m1,ta,ma
7575

7676
/* load table 1 */
7777
slli t3, x_vec_i, 5

erasure_code/riscv64/gf_3vect_dot_prod_rvv.S

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,11 @@ gf_3vect_dot_prod_rvv:
8484
sd s0, 0(sp)
8585
sd s1, 8(sp)
8686

87-
vsetvli a7, x0, e8, m1 /* Set vector length to maximum */
88-
87+
vsetvli a7, x0, e8, m1,ta,ma /* Set vector length to maximum */
8988
li x_pos, 0
90-
slli t_offset, x_vec, 5
89+
slli x_vec, x_vec, 3
90+
91+
slli t_offset, x_vec, 2
9192
ld x_dest1, 0(x_dest)
9293
ld x_dest2, 8(x_dest)
9394
ld x_dest3, 16(x_dest)
@@ -101,20 +102,19 @@ gf_3vect_dot_prod_rvv:
101102
vmv.v.i v_dest2, 0
102103
vmv.v.i v_dest3, 0
103104

105+
/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
106+
li x_vec_i, 0
107+
/* load source pointer */
108+
ld x_ptr, 0(x_src)
109+
104110
/* Reset table pointers */
105111
mv x_tbl1, x_tbl
106112
add x_tbl2, x_tbl1, t_offset
107113
add x_tbl3, x_tbl2, t_offset
108114

109-
/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
110-
li x_vec_i, 0
111115
.Lloop_rvv_vl_vects:
112-
/* Load source data */
113-
slli t0, x_vec_i, 3
114-
add t0,x_src,t0
115-
ld x_ptr, 0(t0)
116+
/* load source data */
116117
add x_ptr,x_ptr,x_pos
117-
118118
vle8.v v_src, (x_ptr)
119119

120120
/* Split 4-bit lo; 4-bit hi */
@@ -131,14 +131,22 @@ gf_3vect_dot_prod_rvv:
131131
vle8.v v_gft2_hi, (x_tbl2)
132132
addi x_tbl2, x_tbl2, 16
133133

134+
/* Move to next source vector */
135+
addi x_vec_i, x_vec_i, 8
136+
add t0,x_src,x_vec_i
137+
ld x_ptr, 0(t0)
138+
139+
prefetch.r 0(x_tbl1)
140+
prefetch.r 0(x_tbl2)
134141

135142
/* Load next gf_table's */
136143
vle8.v v_gft3_lo, (x_tbl3)
137144
addi x_tbl3, x_tbl3, 16
138145
vle8.v v_gft3_hi, (x_tbl3)
139146
addi x_tbl3, x_tbl3, 16
147+
prefetch.r 0(x_tbl3)
140148

141-
/* dest 1 */
149+
/* dest 1 */
142150
vrgather.vv v26, v_gft1_lo, v_src_lo
143151
vrgather.vv v27, v_gft1_hi, v_src_hi
144152
vxor.vv v_dest1, v_dest1, v26
@@ -156,9 +164,6 @@ gf_3vect_dot_prod_rvv:
156164
vxor.vv v_dest3, v_dest3, v26
157165
vxor.vv v_dest3, v_dest3, v27
158166

159-
/* Move to next source vector */
160-
addi x_vec_i, x_vec_i, 1
161-
162167
/* Check if we have processed all vectors */
163168
blt x_vec_i, x_vec, .Lloop_rvv_vl_vects
164169

erasure_code/riscv64/gf_3vect_mad_rvv.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ gf_3vect_mad_rvv:
7575
li t4, 16
7676
blt x_len, t4, .return_fail
7777

78-
vsetvli t5, x0, e8, m1
78+
vsetvli t5, x0, e8, m1,ta,ma
7979

8080
/* Load table 1 */
8181
slli t4, x_vec_i, 5

erasure_code/riscv64/gf_4vect_dot_prod_rvv.S

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,13 @@ gf_4vect_dot_prod_rvv:
9191
sd s2, 16(sp)
9292
sd s3, 24(sp)
9393

94-
vsetvli t0, x0, e8, m1 /* Set vector length to maximum */
95-
94+
# vsetvli t0, x0, e8, m1 /* Set vector length to maximum */
95+
vsetvli t0, x0, e8, m1,ta,ma
9696
li x_pos, 0
97-
slli t_offset, x_vec, 5
97+
98+
slli x_vec, x_vec, 3
99+
slli t_offset, x_vec, 2
100+
98101
ld x_dest1, 0(x_dest)
99102
ld x_dest2, 8(x_dest)
100103
ld x_dest3, 16(x_dest)
@@ -111,19 +114,20 @@ gf_4vect_dot_prod_rvv:
111114
vmv.v.i v_dest3, 0
112115
vmv.v.i v_dest4, 0
113116

117+
/* x_vec, number of source vectors (ie. data blocks) */
118+
li x_vec_i, 0
119+
120+
/* load source pointer */
121+
ld x_ptr, 0(x_src)
122+
114123
/* Reset table pointers */
115124
mv x_tbl1, x_tbl
116125
add x_tbl2, x_tbl1, t_offset
117126
add x_tbl3, x_tbl2, t_offset
118127
add x_tbl4, x_tbl3, t_offset
119128

120-
/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
121-
li x_vec_i, 0
122129
.Lloop_rvv_vl_vects:
123130
/* Load source data */
124-
slli a6, x_vec_i, 3
125-
add a6,x_src,a6
126-
ld x_ptr, 0(a6)
127131
add x_ptr,x_ptr,x_pos
128132

129133
vle8.v v_src, (x_ptr)
@@ -142,6 +146,13 @@ gf_4vect_dot_prod_rvv:
142146
vle8.v v_gft2_hi, (x_tbl2)
143147
addi x_tbl2, x_tbl2, 16
144148

149+
prefetch.r 0(x_tbl1)
150+
prefetch.r 0(x_tbl2)
151+
152+
/* Move to next source vector */
153+
addi x_vec_i, x_vec_i, 8
154+
add a6,x_src,x_vec_i
155+
ld x_ptr, 0(a6)
145156

146157
/* Load next gf_table's */
147158
vle8.v v_gft3_lo, (x_tbl3)
@@ -153,6 +164,8 @@ gf_4vect_dot_prod_rvv:
153164
addi x_tbl4, x_tbl4, 16
154165
vle8.v v_gft4_hi, (x_tbl4)
155166
addi x_tbl4, x_tbl4, 16
167+
prefetch.r 0(x_tbl3)
168+
prefetch.r 0(x_tbl4)
156169

157170
/* dest 1 */
158171
vrgather.vv v26, v_gft1_lo, v_src_lo
@@ -178,9 +191,6 @@ gf_4vect_dot_prod_rvv:
178191
vxor.vv v_dest4, v_dest4, v26
179192
vxor.vv v_dest4, v_dest4, v27
180193

181-
/* Move to next source vector */
182-
addi x_vec_i, x_vec_i, 1
183-
184194
/* Check if we have processed all vectors */
185195
blt x_vec_i, x_vec, .Lloop_rvv_vl_vects
186196

@@ -198,7 +208,7 @@ gf_4vect_dot_prod_rvv:
198208
j .Lloop_rvv_vl
199209

200210
.return_pass:
201-
/* restore callee-saved registers */
211+
/* restore callee-saved registers */
202212
ld s0, 0(sp)
203213
ld s1, 8(sp)
204214
ld s2, 16(sp)

erasure_code/riscv64/gf_4vect_mad_rvv.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ gf_4vect_mad_rvv:
7979
li t5, 16
8080
blt x_len, t5, .return_fail
8181

82-
vsetvli t6, x0, e8, m1
82+
vsetvli t6, x0, e8, m1,ta,ma
8383

8484
/* load table 1 */
8585
slli t5, x_vec_i, 5

erasure_code/riscv64/gf_5vect_dot_prod_rvv.S

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,7 @@ gf_5vect_dot_prod_rvv:
9595
sd s4, 32(sp)
9696
sd s5, 40(sp)
9797

98-
vsetvli a5, x0, e8, m1
99-
98+
vsetvli a5, x0, e8, m1,ta,ma
10099
/* Initialize position */
101100
li x_pos, 0
102101

@@ -151,6 +150,9 @@ gf_5vect_dot_prod_rvv:
151150
vle8.v v_gft2_hi, (x_tbl2)
152151
addi x_tbl2, x_tbl2, 16
153152

153+
prefetch.r 0(x_tbl1)
154+
prefetch.r 0(x_tbl2)
155+
154156
/* Move to next source vector */
155157
addi x_vec_i, x_vec_i, 1
156158

@@ -171,6 +173,9 @@ gf_5vect_dot_prod_rvv:
171173
vle8.v v_gft4_hi, (x_tbl4)
172174
addi x_tbl4, x_tbl4, 16
173175

176+
prefetch.r 0(x_tbl3)
177+
prefetch.r 0(x_tbl4)
178+
174179
/* dest 2 */
175180
vrgather.vv v26, v_gft2_lo, v_src_lo
176181
vrgather.vv v27, v_gft2_hi, v_src_hi
@@ -188,6 +193,7 @@ gf_5vect_dot_prod_rvv:
188193
addi x_tbl5, x_tbl5, 16
189194
vle8.v v_gft5_hi, (x_tbl5)
190195
addi x_tbl5, x_tbl5, 16
196+
prefetch.r 0(x_tbl5)
191197

192198
/* dest 4 */
193199
vrgather.vv v26, v_gft4_lo, v_src_lo

erasure_code/riscv64/gf_5vect_mad_rvv.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ gf_5vect_mad_rvv:
8383
li t6, 16
8484
blt x_len, t6, .return_fail
8585

86-
vsetvli a7, x0, e8, m1
86+
vsetvli a7, x0, e8, m1,ta,ma
8787

8888
/* Load table 1 */
8989
slli a6, x_vec_i, 5

erasure_code/riscv64/gf_6vect_dot_prod_rvv.S

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,13 @@ gf_6vect_dot_prod_rvv:
102102
sd s7, 56(sp)
103103

104104
li t0, 0x0F
105-
vsetvli a5, x0, e8, m1
105+
vsetvli a5, x0, e8, m1,ta,ma
106106

107107
/* initialize position */
108108
li x_pos, 0
109109

110+
slli x_vec, x_vec, 3
111+
110112
/* load destination pointers */
111113
ld x_dest1, 0(x14) # a4 is also x14
112114
ld x_dest2, 8(x_dest)
@@ -136,7 +138,7 @@ gf_6vect_dot_prod_rvv:
136138
/* initialize table pointers */
137139
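/* gf_tbl base = x_tbl + dest_idx * (number of source vectors) * 32; x_vec is already scaled by 8 (slli x_vec, x_vec, 3), so the slli below shifts by 2, not 5 */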
138140
mv x_tbl1, x_tbl
139-
slli t0, x_vec, 5
141+
slli t0, x_vec, 2
140142
add x_tbl2, x_tbl1, t0
141143
add x_tbl3, x_tbl2, t0
142144
add x_tbl4, x_tbl3, t0
@@ -145,11 +147,7 @@ gf_6vect_dot_prod_rvv:
145147

146148
.Llooprvv_vl_vects:
147149
/* load source data */
148-
slli a6, x_vec_i, 3
149-
add a6,x_src,a6
150-
ld x_ptr, 0(a6)
151150
add x_ptr,x_ptr,x_pos
152-
153151
vle8.v v_src, (x_ptr)
154152

155153

@@ -167,6 +165,13 @@ gf_6vect_dot_prod_rvv:
167165
addi x_tbl2, x_tbl2, 16
168166
vle8.v v_gft2_hi, (x_tbl2)
169167
addi x_tbl2, x_tbl2, 16
168+
prefetch.r 0(x_tbl1)
169+
prefetch.r 0(x_tbl2)
170+
171+
/* load next source pointer */
172+
addi x_vec_i, x_vec_i,8
173+
add a6,x_src,x_vec_i
174+
ld x_ptr, 0(a6)
170175

171176
vle8.v v_gft3_lo, (x_tbl3)
172177
addi x_tbl3, x_tbl3, 16
@@ -178,6 +183,9 @@ gf_6vect_dot_prod_rvv:
178183
vle8.v v_gft4_hi, (x_tbl4)
179184
addi x_tbl4, x_tbl4, 16
180185

186+
prefetch.r 0(x_tbl3)
187+
prefetch.r 0(x_tbl4)
188+
181189
vle8.v v_gft5_lo, (x_tbl5)
182190
addi x_tbl5, x_tbl5, 16
183191
vle8.v v_gft5_hi, (x_tbl5)
@@ -188,6 +196,8 @@ gf_6vect_dot_prod_rvv:
188196
vle8.v v_gft6_hi, (x_tbl6)
189197
addi x_tbl6, x_tbl6, 16
190198

199+
prefetch.r 0(x_tbl5)
200+
prefetch.r 0(x_tbl6)
191201

192202
/* dest 1 */
193203
vrgather.vv v26, v_gft1_lo, v_src_lo
@@ -225,10 +235,6 @@ gf_6vect_dot_prod_rvv:
225235
vxor.vv v_dest6, v_dest6, v26
226236
vxor.vv v_dest6, v_dest6, v27
227237

228-
229-
/* load next source pointer */
230-
addi x_vec_i, x_vec_i,1
231-
232238
/* check if we have processed all vectors */
233239
blt x_vec_i, x_vec, .Llooprvv_vl_vects
234240

0 commit comments

Comments
 (0)