@@ -74,137 +74,18 @@
 	init_crc	.req	w0
 	buf		.req	x1
 	len		.req	x2
-	fold_consts_ptr	.req	x3
+	fold_consts_ptr	.req	x5
 
 	fold_consts	.req	v10
 
-	ad		.req	v14
-
-	k00_16		.req	v15
-	k32_48		.req	v16
-
 	t3		.req	v17
 	t4		.req	v18
 	t5		.req	v19
 	t6		.req	v20
 	t7		.req	v21
 	t8		.req	v22
-	t9		.req	v23
-
-	perm1		.req	v24
-	perm2		.req	v25
-	perm3		.req	v26
-	perm4		.req	v27
-
-	bd1		.req	v28
-	bd2		.req	v29
-	bd3		.req	v30
-	bd4		.req	v31
-
-	.macro		__pmull_init_p64
-	.endm
 
-	.macro		__pmull_pre_p64, bd
-	.endm
-
-	.macro		__pmull_init_p8
-	// k00_16 := 0x0000000000000000_000000000000ffff
-	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
-	movi		k32_48.2d, #0xffffffff
-	mov		k32_48.h[2], k32_48.h[0]
-	ushr		k00_16.2d, k32_48.2d, #32
-
-	// prepare the permutation vectors
-	mov_q		x5, 0x080f0e0d0c0b0a09
-	movi		perm4.8b, #8
-	dup		perm1.2d, x5
-	eor		perm1.16b, perm1.16b, perm4.16b
-	ushr		perm2.2d, perm1.2d, #8
-	ushr		perm3.2d, perm1.2d, #16
-	ushr		perm4.2d, perm1.2d, #24
-	sli		perm2.2d, perm1.2d, #56
-	sli		perm3.2d, perm1.2d, #48
-	sli		perm4.2d, perm1.2d, #40
-
-	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
-	movi		bd1.4h, #8, lsl #8
-	orr		bd1.2s, #1, lsl #16
-	orr		bd1.2s, #1, lsl #24
-	zip1		bd1.16b, bd1.16b, bd1.16b
-	zip1		bd1.16b, bd1.16b, bd1.16b
-	.endm
-
-	.macro		__pmull_pre_p8, bd
-	tbl		bd1.16b, {\bd\().16b}, perm1.16b
-	tbl		bd2.16b, {\bd\().16b}, perm2.16b
-	tbl		bd3.16b, {\bd\().16b}, perm3.16b
-	tbl		bd4.16b, {\bd\().16b}, perm4.16b
-	.endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_core)
-.L__pmull_p8_core:
-	ext		t4.8b, ad.8b, ad.8b, #1			// A1
-	ext		t5.8b, ad.8b, ad.8b, #2			// A2
-	ext		t6.8b, ad.8b, ad.8b, #3			// A3
-
-	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1 * B
-	pmull		t8.8h, ad.8b, bd1.8b			// E = A * B1
-	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2 * B
-	pmull		t7.8h, ad.8b, bd2.8b			// G = A * B2
-	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3 * B
-	pmull		t9.8h, ad.8b, bd3.8b			// I = A * B3
-	pmull		t3.8h, ad.8b, bd4.8b			// K = A * B4
-	b		0f
-
-.L__pmull_p8_core2:
-	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
-	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
-	tbl		t6.16b, {ad.16b}, perm3.16b		// A3
-
-	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1 * B
-	pmull2		t8.8h, ad.16b, bd1.16b			// E = A * B1
-	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2 * B
-	pmull2		t7.8h, ad.16b, bd2.16b			// G = A * B2
-	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3 * B
-	pmull2		t9.8h, ad.16b, bd3.16b			// I = A * B3
-	pmull2		t3.8h, ad.16b, bd4.16b			// K = A * B4
-
-0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
-	eor		t5.16b, t5.16b, t7.16b			// M = G + H
-	eor		t6.16b, t6.16b, t9.16b			// N = I + J
-
-	uzp1		t8.2d, t4.2d, t5.2d
-	uzp2		t4.2d, t4.2d, t5.2d
-	uzp1		t7.2d, t6.2d, t3.2d
-	uzp2		t6.2d, t6.2d, t3.2d
-
-	// t4 = (L) (P0 + P1) << 8
-	// t5 = (M) (P2 + P3) << 16
-	eor		t8.16b, t8.16b, t4.16b
-	and		t4.16b, t4.16b, k32_48.16b
-
-	// t6 = (N) (P4 + P5) << 24
-	// t7 = (K) (P6 + P7) << 32
-	eor		t7.16b, t7.16b, t6.16b
-	and		t6.16b, t6.16b, k00_16.16b
-
-	eor		t8.16b, t8.16b, t4.16b
-	eor		t7.16b, t7.16b, t6.16b
-
-	zip2		t5.2d, t8.2d, t4.2d
-	zip1		t4.2d, t8.2d, t4.2d
-	zip2		t3.2d, t7.2d, t6.2d
-	zip1		t6.2d, t7.2d, t6.2d
-
-	ext		t4.16b, t4.16b, t4.16b, #15
-	ext		t5.16b, t5.16b, t5.16b, #14
-	ext		t6.16b, t6.16b, t6.16b, #13
-	ext		t3.16b, t3.16b, t3.16b, #12
-
-	eor		t4.16b, t4.16b, t5.16b
-	eor		t6.16b, t6.16b, t3.16b
-	ret
-SYM_FUNC_END(__pmull_p8_core)
+	perm		.req	v27
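+
+	// 'perm' is the only vector constant the p8 fallback still needs;
+	// it is composed at runtime by crc_t10dif_pmull_p8() below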
 
 	.macro		pmull16x64_p64, a16, b64, c64
 	pmull2		\c64\().1q, \a16\().2d, \b64\().2d
@@ -266,7 +147,7 @@ SYM_FUNC_END(__pmull_p8_core)
 	 */
 	.macro		pmull16x64_p8, a16, b64, c64
 	ext		t7.16b, \b64\().16b, \b64\().16b, #1
-	tbl		t5.16b, {\a16\().16b}, bd1.16b
+	tbl		t5.16b, {\a16\().16b}, perm.16b
 	uzp1		t7.16b, \b64\().16b, t7.16b
 	bl		__pmull_p8_16x64
 	ext		\b64\().16b, t4.16b, t4.16b, #15
@@ -292,22 +173,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
 	ret
 SYM_FUNC_END(__pmull_p8_16x64)
 
-	.macro		__pmull_p8, rq, ad, bd, i
-	.ifnc		\bd, fold_consts
-	.err
-	.endif
-	mov		ad.16b, \ad\().16b
-	.ifb		\i
-	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A * B
-	.else
-	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A * B
-	.endif
-
-	bl		.L__pmull_p8_core\i
-
-	eor		\rq\().16b, \rq\().16b, t4.16b
-	eor		\rq\().16b, \rq\().16b, t6.16b
-	.endm
 
 	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
 	// into reg1, reg2.
@@ -340,16 +205,7 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
 	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
 	.endm
 
-	.macro		__pmull_p64, rd, rn, rm, n
-	.ifb		\n
-	pmull		\rd\().1q, \rn\().1d, \rm\().1d
-	.else
-	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
-	.endif
-	.endm
-
 	.macro		crc_t10dif_pmull, p
-	__pmull_init_\p
 
 	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
 	cmp		len, #256
@@ -479,47 +335,7 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
 	pmull16x64_\p	fold_consts, v3, v0
 	eor		v7.16b, v3.16b, v0.16b
 	eor		v7.16b, v7.16b, v2.16b
-
-.Lreduce_final_16_bytes_\@:
-	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
-	movi		v2.16b, #0		// init zero register
-
-	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
-	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
-	__pmull_pre_\p	fold_consts
-
-	// Fold the high 64 bits into the low 64 bits, while also multiplying by
-	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
-	// whose low 48 bits are 0.
-	ext		v0.16b, v2.16b, v7.16b, #8
-	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
-	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64
-
-	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
-	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
-	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
-	mov		v0.s[3], v2.s[0]	// zero high 32 bits
-	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
-	eor		v0.16b, v0.16b, v1.16b	// + low bits
-
-	// Load G(x) and floor(x^48 / G(x)).
-	ld1		{fold_consts.2d}, [fold_consts_ptr]
-	__pmull_pre_\p	fold_consts
-
-	// Use Barrett reduction to compute the final CRC value.
-	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
-	ushr		v1.2d, v1.2d, #32	// /= x^32
-	__pmull_\p	v1, v1, fold_consts	// *= G(x)
-	ushr		v0.2d, v0.2d, #48
-	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
-	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
-	umov		w0, v0.h[0]
-	.ifc		\p, p8
-	frame_pop
-	.endif
-	ret
+	b		.Lreduce_final_16_bytes_\@
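+	// (.Lreduce_final_16_bytes_\@ is now emitted at the very end of this
+	// macro, so the final reduction runs in the caller; see below)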
 
 .Lless_than_256_bytes_\@:
 	// Checksumming a buffer of length 16...255 bytes
@@ -545,6 +361,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
 	add		len, len, #16
 	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
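+	// Execution resumes in the caller, right after the crc_t10dif_pmull
+	// macro invocation: the p64 variant reduces v7 to the final CRC there,
+	// while the p8 variant stores the unreduced value out to memory.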
 	.endm
 
 //
@@ -554,7 +372,22 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 //
 SYM_FUNC_START(crc_t10dif_pmull_p8)
	frame_push	1
+
+	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+	movi		perm.4h, #8, lsl #8
+	orr		perm.2s, #1, lsl #16
+	orr		perm.2s, #1, lsl #24
+	zip1		perm.16b, perm.16b, perm.16b
+	zip1		perm.16b, perm.16b, perm.16b
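+	// the movi/orr sequence above leaves bytes { 0, 8, 1, 9 } in each
+	// 32-bit lane of the low 64 bits; each zip1 then doubles every byte,
+	// producing the { 0 x4, 8 x4, 1 x4, 9 x4 } pattern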
+
 	crc_t10dif_pmull p8
+
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+	str		q7, [x3]
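+	// v7 holds the unreduced 128-bit remainder: byte-swap it on
+	// little-endian so memory receives it MSB-first, and pass it back
+	// via the buffer at x3 so the caller can finish the reduction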
+
+	frame_pop
+	ret
 SYM_FUNC_END(crc_t10dif_pmull_p8)
 
 	.align		5
@@ -565,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8)
 //
 SYM_FUNC_START(crc_t10dif_pmull_p64)
 	crc_t10dif_pmull	p64
+
+	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+	movi		v2.16b, #0		// init zero register
+
+	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+
+	// Fold the high 64 bits into the low 64 bits, while also multiplying by
+	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+	// whose low 48 bits are 0.
+	ext		v0.16b, v2.16b, v7.16b, #8
+	pmull2		v7.1q, v7.2d, fold_consts.2d	// high bits * x^48 * (x^80 mod G(x))
+	eor		v0.16b, v0.16b, v7.16b		// + low bits * x^64
+
+	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
+	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
+	mov		v0.s[3], v2.s[0]		// zero high 32 bits
+	pmull		v1.1q, v1.1d, fold_consts.1d	// high 32 bits * x^48 * (x^48 mod G(x))
+	eor		v0.16b, v0.16b, v1.16b		// + low bits
+
+	// Load G(x) and floor(x^48 / G(x)).
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+
+	// Use Barrett reduction to compute the final CRC value.
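+	// Barrett reduction turns the division by G(x) into two carryless
+	// multiplications: one by the precomputed floor(x^48 / G(x)) to form
+	// the quotient, and one by G(x) itself; the eor below then subtracts
+	// (adds, in GF(2)) quotient * G(x), leaving the 16-bit remainder.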
+	pmull2		v1.1q, v0.2d, fold_consts.2d	// high 32 bits * floor(x^48 / G(x))
+	ushr		v1.2d, v1.2d, #32		// /= x^32
+	pmull		v1.1q, v1.1d, fold_consts.1d	// *= G(x)
+	ushr		v0.2d, v0.2d, #48
+	eor		v0.16b, v0.16b, v1.16b		// + low 16 nonzero bits
+	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+	umov		w0, v0.h[0]
+	ret
 SYM_FUNC_END(crc_t10dif_pmull_p64)
 
 	.section	".rodata", "a"