@@ -186,126 +186,115 @@ cdecl(gf_4vect_dot_prod_neon):
186
186
add x_tbl3 , x_tbl2 , x_vec , lsl # 2
187
187
add x_tbl4 , x_tbl3 , x_vec , lsl # 2
188
188
mov x_vec_i , # 0
189
- prfm pldl1keep , [ x_tbl1 ]
190
- prfm pldl1keep , [ x_tbl2 ]
191
- prfm pldl1keep , [ x_tbl3 ]
192
- prfm pldl1keep , [ x_tbl4 ]
193
189
194
190
.Lloop64_vects:
195
191
ldr x_ptr , [ x_src , x_vec_i ]
196
192
add x_vec_i , x_vec_i , # 8
197
193
add x_ptr , x_ptr , x_pos
198
194
199
- ldr q_data_0 , [ x_ptr ], # 16
200
- ldr q_data_1 , [ x_ptr ], # 16
195
+ ldp q_data_0 , q_data_1 , [ x_ptr ], # 32
201
196
ldp q_gft1_lo , q_gft1_hi , [ x_tbl1 ], # 32
202
197
ldp q_gft2_lo , q_gft2_hi , [ x_tbl2 ], # 32
203
198
ldp q_gft3_lo , q_gft3_hi , [ x_tbl3 ], # 32
204
199
ldp q_gft4_lo , q_gft4_hi , [ x_tbl4 ], # 32
205
- ldr q_data_2 , [ x_ptr ], # 16
206
- ldr q_data_3 , [ x_ptr ], # 16
207
-
208
- prfm pldl1strm , [ x_ptr ]
209
- prfm pldl1keep , [ x_tbl1 ]
210
- prfm pldl1keep , [ x_tbl2 ]
211
- prfm pldl1keep , [ x_tbl3 ]
212
- prfm pldl1keep , [ x_tbl4 ]
213
200
214
201
/ * data_0 * /
215
202
and v_tmp1.16b , v_data_0.16b , v_mask0f.16b
216
203
ushr v_data_0.16b , v_data_0.16b , # 4
217
204
218
205
tbl v_tmp1_lo.16b , {v_gft1_lo.16b} , v_tmp1.16b
219
206
tbl v_tmp1_hi.16b , {v_gft1_hi.16b} , v_data_0.16b
207
+ tbl v_data_3.16b , {v_gft2_lo.16b} , v_tmp1.16b
208
+ tbl v_data_2.16b , {v_gft2_hi.16b} , v_data_0.16b
209
+
220
210
eor v_p1_0.16b , v_tmp1_lo.16b , v_p1_0.16b
211
+ eor v_p2_0.16b , v_data_3.16b , v_p2_0.16b
221
212
eor v_p1_0.16b , v_p1_0.16b , v_tmp1_hi.16b
222
-
223
- tbl v_tmp1_lo.16b , {v_gft2_lo.16b} , v_tmp1.16b
224
- tbl v_tmp1_hi.16b , {v_gft2_hi.16b} , v_data_0.16b
225
- eor v_p2_0.16b , v_tmp1_lo.16b , v_p2_0.16b
226
- eor v_p2_0.16b , v_p2_0.16b , v_tmp1_hi.16b
213
+ eor v_p2_0.16b , v_p2_0.16b , v_data_2.16b
227
214
228
215
tbl v_tmp1_lo.16b , {v_gft3_lo.16b} , v_tmp1.16b
229
216
tbl v_tmp1_hi.16b , {v_gft3_hi.16b} , v_data_0.16b
217
+ tbl v_data_2.16b , {v_gft4_lo.16b} , v_tmp1.16b
218
+ tbl v_data_3.16b , {v_gft4_hi.16b} , v_data_0.16b
219
+
230
220
eor v_p3_0.16b , v_tmp1_lo.16b , v_p3_0.16b
231
221
eor v_p3_0.16b , v_p3_0.16b , v_tmp1_hi.16b
232
-
233
- tbl v_tmp1_lo.16b , {v_gft4_lo.16b} , v_tmp1.16b
234
- tbl v_tmp1_hi.16b , {v_gft4_hi.16b} , v_data_0.16b
235
- eor v_p4_0.16b , v_tmp1_lo.16b , v_p4_0.16b
236
- eor v_p4_0.16b , v_p4_0.16b , v_tmp1_hi.16b
222
+ eor v_p4_0.16b , v_data_2.16b , v_p4_0.16b
223
+ eor v_p4_0.16b , v_p4_0.16b , v_data_3.16b
237
224
238
225
/ * data_1 * /
239
226
and v_tmp1.16b , v_data_1.16b , v_mask0f.16b
240
227
ushr v_data_1.16b , v_data_1.16b , # 4
241
228
242
229
tbl v_tmp1_lo.16b , {v_gft1_lo.16b} , v_tmp1.16b
243
230
tbl v_tmp1_hi.16b , {v_gft1_hi.16b} , v_data_1.16b
231
+ tbl v_data_2.16b , {v_gft2_lo.16b} , v_tmp1.16b
232
+ tbl v_data_3.16b , {v_gft2_hi.16b} , v_data_1.16b
233
+
244
234
eor v_p1_1.16b , v_tmp1_lo.16b , v_p1_1.16b
235
+ eor v_p2_1.16b , v_data_2.16b , v_p2_1.16b
245
236
eor v_p1_1.16b , v_p1_1.16b , v_tmp1_hi.16b
246
-
247
- tbl v_tmp1_lo.16b , {v_gft2_lo.16b} , v_tmp1.16b
248
- tbl v_tmp1_hi.16b , {v_gft2_hi.16b} , v_data_1.16b
249
- eor v_p2_1.16b , v_tmp1_lo.16b , v_p2_1.16b
250
- eor v_p2_1.16b , v_p2_1.16b , v_tmp1_hi.16b
237
+ eor v_p2_1.16b , v_p2_1.16b , v_data_3.16b
251
238
252
239
tbl v_tmp1_lo.16b , {v_gft3_lo.16b} , v_tmp1.16b
253
240
tbl v_tmp1_hi.16b , {v_gft3_hi.16b} , v_data_1.16b
241
+ tbl v_data_2.16b , {v_gft4_lo.16b} , v_tmp1.16b
242
+ tbl v_data_3.16b , {v_gft4_hi.16b} , v_data_1.16b
243
+
254
244
eor v_p3_1.16b , v_tmp1_lo.16b , v_p3_1.16b
255
245
eor v_p3_1.16b , v_p3_1.16b , v_tmp1_hi.16b
246
+ eor v_p4_1.16b , v_data_2.16b , v_p4_1.16b
247
+ eor v_p4_1.16b , v_p4_1.16b , v_data_3.16b
256
248
257
- tbl v_tmp1_lo.16b , {v_gft4_lo.16b} , v_tmp1.16b
258
- tbl v_tmp1_hi.16b , {v_gft4_hi.16b} , v_data_1.16b
259
- eor v_p4_1.16b , v_tmp1_lo.16b , v_p4_1.16b
260
- eor v_p4_1.16b , v_p4_1.16b , v_tmp1_hi.16b
249
+ ldp q_data_2 , q_data_3 , [ x_ptr ], # 32
261
250
262
251
/ * data_2 * /
263
252
and v_tmp1.16b , v_data_2.16b , v_mask0f.16b
264
253
ushr v_data_2.16b , v_data_2.16b , # 4
265
254
266
255
tbl v_tmp1_lo.16b , {v_gft1_lo.16b} , v_tmp1.16b
267
256
tbl v_tmp1_hi.16b , {v_gft1_hi.16b} , v_data_2.16b
257
+ tbl v_data_0.16b , {v_gft2_lo.16b} , v_tmp1.16b
258
+ tbl v_data_1.16b , {v_gft2_hi.16b} , v_data_2.16b
259
+
268
260
eor v_p1_2.16b , v_tmp1_lo.16b , v_p1_2.16b
261
+ eor v_p2_2.16b , v_data_0.16b , v_p2_2.16b
269
262
eor v_p1_2.16b , v_p1_2.16b , v_tmp1_hi.16b
270
-
271
- tbl v_tmp1_lo.16b , {v_gft2_lo.16b} , v_tmp1.16b
272
- tbl v_tmp1_hi.16b , {v_gft2_hi.16b} , v_data_2.16b
273
- eor v_p2_2.16b , v_tmp1_lo.16b , v_p2_2.16b
274
- eor v_p2_2.16b , v_p2_2.16b , v_tmp1_hi.16b
263
+ eor v_p2_2.16b , v_p2_2.16b , v_data_1.16b
275
264
276
265
tbl v_tmp1_lo.16b , {v_gft3_lo.16b} , v_tmp1.16b
277
266
tbl v_tmp1_hi.16b , {v_gft3_hi.16b} , v_data_2.16b
267
+ tbl v_data_0.16b , {v_gft4_lo.16b} , v_tmp1.16b
268
+ tbl v_data_1.16b , {v_gft4_hi.16b} , v_data_2.16b
269
+
278
270
eor v_p3_2.16b , v_tmp1_lo.16b , v_p3_2.16b
279
271
eor v_p3_2.16b , v_p3_2.16b , v_tmp1_hi.16b
280
-
281
- tbl v_tmp1_lo.16b , {v_gft4_lo.16b} , v_tmp1.16b
282
- tbl v_tmp1_hi.16b , {v_gft4_hi.16b} , v_data_2.16b
283
- eor v_p4_2.16b , v_tmp1_lo.16b , v_p4_2.16b
284
- eor v_p4_2.16b , v_p4_2.16b , v_tmp1_hi.16b
272
+ eor v_p4_2.16b , v_data_0.16b , v_p4_2.16b
273
+ eor v_p4_2.16b , v_p4_2.16b , v_data_1.16b
285
274
286
275
/ * data_3 * /
287
276
and v_tmp1.16b , v_data_3.16b , v_mask0f.16b
288
277
ushr v_data_3.16b , v_data_3.16b , # 4
289
278
290
279
tbl v_tmp1_lo.16b , {v_gft1_lo.16b} , v_tmp1.16b
291
280
tbl v_tmp1_hi.16b , {v_gft1_hi.16b} , v_data_3.16b
281
+ tbl v_data_0.16b , {v_gft2_lo.16b} , v_tmp1.16b
282
+ tbl v_data_1.16b , {v_gft2_hi.16b} , v_data_3.16b
283
+
292
284
eor v_p1_3.16b , v_tmp1_lo.16b , v_p1_3.16b
285
+ eor v_p2_3.16b , v_data_0.16b , v_p2_3.16b
293
286
eor v_p1_3.16b , v_p1_3.16b , v_tmp1_hi.16b
294
-
295
- tbl v_tmp1_lo.16b , {v_gft2_lo.16b} , v_tmp1.16b
296
- tbl v_tmp1_hi.16b , {v_gft2_hi.16b} , v_data_3.16b
297
- eor v_p2_3.16b , v_tmp1_lo.16b , v_p2_3.16b
298
- eor v_p2_3.16b , v_p2_3.16b , v_tmp1_hi.16b
287
+ eor v_p2_3.16b , v_p2_3.16b , v_data_1.16b
299
288
300
289
tbl v_tmp1_lo.16b , {v_gft3_lo.16b} , v_tmp1.16b
301
290
tbl v_tmp1_hi.16b , {v_gft3_hi.16b} , v_data_3.16b
291
+ tbl v_data_0.16b , {v_gft4_lo.16b} , v_tmp1.16b
292
+ tbl v_data_1.16b , {v_gft4_hi.16b} , v_data_3.16b
293
+
302
294
eor v_p3_3.16b , v_tmp1_lo.16b , v_p3_3.16b
303
295
eor v_p3_3.16b , v_p3_3.16b , v_tmp1_hi.16b
304
-
305
- tbl v_tmp1_lo.16b , {v_gft4_lo.16b} , v_tmp1.16b
306
- tbl v_tmp1_hi.16b , {v_gft4_hi.16b} , v_data_3.16b
307
- eor v_p4_3.16b , v_tmp1_lo.16b , v_p4_3.16b
308
- eor v_p4_3.16b , v_p4_3.16b , v_tmp1_hi.16b
296
+ eor v_p4_3.16b , v_data_0.16b , v_p4_3.16b
297
+ eor v_p4_3.16b , v_p4_3.16b , v_data_1.16b
309
298
310
299
cmp x_vec_i , x_vec
311
300
blt .Lloop64_vects
0 commit comments