 	init_crc	.req	w0
 	buf		.req	x1
 	len		.req	x2
-	fold_consts_ptr	.req	x3
+	fold_consts_ptr	.req	x5
 
 	fold_consts	.req	v10
 
-	ad		.req	v14
-
-	k00_16		.req	v15
-	k32_48		.req	v16
-
 	t3		.req	v17
 	t4		.req	v18
 	t5		.req	v19
 	t6		.req	v20
 	t7		.req	v21
 	t8		.req	v22
-	t9		.req	v23
-
-	perm1		.req	v24
-	perm2		.req	v25
-	perm3		.req	v26
-	perm4		.req	v27
-
-	bd1		.req	v28
-	bd2		.req	v29
-	bd3		.req	v30
-	bd4		.req	v31
-
-	.macro		__pmull_init_p64
-	.endm
 
-	.macro		__pmull_pre_p64, bd
-	.endm
-
-	.macro		__pmull_init_p8
-	// k00_16 := 0x0000000000000000_000000000000ffff
-	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
-	movi		k32_48.2d, #0xffffffff
-	mov		k32_48.h[2], k32_48.h[0]
-	ushr		k00_16.2d, k32_48.2d, #32
-
-	// prepare the permutation vectors
-	mov_q		x5, 0x080f0e0d0c0b0a09
-	movi		perm4.8b, #8
-	dup		perm1.2d, x5
-	eor		perm1.16b, perm1.16b, perm4.16b
-	ushr		perm2.2d, perm1.2d, #8
-	ushr		perm3.2d, perm1.2d, #16
-	ushr		perm4.2d, perm1.2d, #24
-	sli		perm2.2d, perm1.2d, #56
-	sli		perm3.2d, perm1.2d, #48
-	sli		perm4.2d, perm1.2d, #40
-
-	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
-	movi		bd1.4h, #8, lsl #8
-	orr		bd1.2s, #1, lsl #16
-	orr		bd1.2s, #1, lsl #24
-	zip1		bd1.16b, bd1.16b, bd1.16b
-	zip1		bd1.16b, bd1.16b, bd1.16b
-	.endm
-
-	.macro		__pmull_pre_p8, bd
-	tbl		bd1.16b, {\bd\().16b}, perm1.16b
-	tbl		bd2.16b, {\bd\().16b}, perm2.16b
-	tbl		bd3.16b, {\bd\().16b}, perm3.16b
-	tbl		bd4.16b, {\bd\().16b}, perm4.16b
-	.endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_core)
-.L__pmull_p8_core:
-	ext		t4.8b, ad.8b, ad.8b, #1			// A1
-	ext		t5.8b, ad.8b, ad.8b, #2			// A2
-	ext		t6.8b, ad.8b, ad.8b, #3			// A3
-
-	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1 * B
-	pmull		t8.8h, ad.8b, bd1.8b			// E = A * B1
-	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2 * B
-	pmull		t7.8h, ad.8b, bd2.8b			// G = A * B2
-	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3 * B
-	pmull		t9.8h, ad.8b, bd3.8b			// I = A * B3
-	pmull		t3.8h, ad.8b, bd4.8b			// K = A * B4
-	b		0f
-
-.L__pmull_p8_core2:
-	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
-	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
-	tbl		t6.16b, {ad.16b}, perm3.16b		// A3
-
-	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1 * B
-	pmull2		t8.8h, ad.16b, bd1.16b			// E = A * B1
-	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2 * B
-	pmull2		t7.8h, ad.16b, bd2.16b			// G = A * B2
-	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3 * B
-	pmull2		t9.8h, ad.16b, bd3.16b			// I = A * B3
-	pmull2		t3.8h, ad.16b, bd4.16b			// K = A * B4
-
-0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
-	eor		t5.16b, t5.16b, t7.16b			// M = G + H
-	eor		t6.16b, t6.16b, t9.16b			// N = I + J
-
-	uzp1		t8.2d, t4.2d, t5.2d
-	uzp2		t4.2d, t4.2d, t5.2d
-	uzp1		t7.2d, t6.2d, t3.2d
-	uzp2		t6.2d, t6.2d, t3.2d
-
-	// t4 = (L) (P0 + P1) << 8
-	// t5 = (M) (P2 + P3) << 16
-	eor		t8.16b, t8.16b, t4.16b
-	and		t4.16b, t4.16b, k32_48.16b
-
-	// t6 = (N) (P4 + P5) << 24
-	// t7 = (K) (P6 + P7) << 32
-	eor		t7.16b, t7.16b, t6.16b
-	and		t6.16b, t6.16b, k00_16.16b
-
-	eor		t8.16b, t8.16b, t4.16b
-	eor		t7.16b, t7.16b, t6.16b
-
-	zip2		t5.2d, t8.2d, t4.2d
-	zip1		t4.2d, t8.2d, t4.2d
-	zip2		t3.2d, t7.2d, t6.2d
-	zip1		t6.2d, t7.2d, t6.2d
-
-	ext		t4.16b, t4.16b, t4.16b, #15
-	ext		t5.16b, t5.16b, t5.16b, #14
-	ext		t6.16b, t6.16b, t6.16b, #13
-	ext		t3.16b, t3.16b, t3.16b, #12
-
-	eor		t4.16b, t4.16b, t5.16b
-	eor		t6.16b, t6.16b, t3.16b
-	ret
-SYM_FUNC_END(__pmull_p8_core)
+	perm		.req	v27
 
 	.macro		pmull16x64_p64, a16, b64, c64
 	pmull2		\c64\().1q, \a16\().2d, \b64\().2d
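The pmull16x64_p64 macro above performs a 16x64-bit carryless multiply: the 16-bit fold constants sit in the 64-bit lanes of \a16 and are multiplied, without carries, into the 64-bit halves of \b64. As a point of reference only (the helper name is illustrative, not kernel code), the same operation spelled out bit by bit in C:

#include <stdint.h>

/* Bit-by-bit carryless (GF(2)) multiply of a 16-bit polynomial by a 64-bit
 * one.  The product has degree <= 78, so it fits in 128 bits. */
static unsigned __int128 clmul_16x64(uint16_t a, uint64_t b)
{
	unsigned __int128 acc = 0;

	for (int i = 0; i < 16; i++)
		if (a & (1u << i))
			acc ^= (unsigned __int128)b << i;	/* XOR is carryless addition */
	return acc;
}

PMULL/PMULL2 compute exactly this kind of GF(2) product on 64-bit lanes in a single instruction, which is why the p64 variant needs no further decomposition.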
@@ -266,7 +147,7 @@ SYM_FUNC_END(__pmull_p8_core)
 	 */
 	.macro		pmull16x64_p8, a16, b64, c64
 	ext		t7.16b, \b64\().16b, \b64\().16b, #1
-	tbl		t5.16b, {\a16\().16b}, bd1.16b
+	tbl		t5.16b, {\a16\().16b}, perm.16b
 	uzp1		t7.16b, \b64\().16b, t7.16b
 	bl		__pmull_p8_16x64
 	ext		\b64\().16b, t4.16b, t4.16b, #15
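The pmull16x64_p8 fallback above reaches the same product using only 8-bit PMULL operations: perm replicates the low and high bytes of each 16-bit constant so they can be multiplied separately, and the ext/eor sequence reassembles the two partial products one byte position apart. The underlying algebra is just the distributive law over GF(2): with a = a_hi*x^8 + a_lo, a*b = (a_hi*b)*x^8 + a_lo*b. A minimal C sketch of that identity (helper names are illustrative, not taken from the kernel):

#include <stdint.h>

/* 8x64-bit carryless multiply, again spelled out bit by bit; the NEON code
 * uses PMULL on 8-bit lanes instead. */
static unsigned __int128 clmul_8x64(uint8_t a, uint64_t b)
{
	unsigned __int128 acc = 0;

	for (int i = 0; i < 8; i++)
		if (a & (1u << i))
			acc ^= (unsigned __int128)b << i;
	return acc;
}

/* 16x64 product assembled from two 8x64 products: a = a_hi*x^8 + a_lo */
static unsigned __int128 clmul_16x64_by_bytes(uint16_t a, uint64_t b)
{
	return (clmul_8x64(a >> 8, b) << 8) ^ clmul_8x64(a & 0xff, b);
}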
@@ -292,22 +173,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
 	ret
 SYM_FUNC_END(__pmull_p8_16x64)
 
-	.macro		__pmull_p8, rq, ad, bd, i
-	.ifnc		\bd, fold_consts
-	.err
-	.endif
-	mov		ad.16b, \ad\().16b
-	.ifb		\i
-	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A * B
-	.else
-	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A * B
-	.endif
-
-	bl		.L__pmull_p8_core\i
-
-	eor		\rq\().16b, \rq\().16b, t4.16b
-	eor		\rq\().16b, \rq\().16b, t6.16b
-	.endm
 
 	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
 	// into reg1, reg2.
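The fold macros whose bodies follow implement the usual CRC folding step: the 128-bit running value is split into 64-bit halves, each half is carryless-multiplied by a precomputed constant of the form x^n mod G(x), and the results are XORed into the next block of data. Because G(x) has degree 16, those constants fit in 16 bits, which is why a 16x64-bit multiply (pmull16x64_\p) is all the loop needs. A sketch of the shape of one fold step, assuming a hypothetical clmul_64x64() helper and leaving the exact exponents to the constant table:

#include <stdint.h>

typedef unsigned __int128 u128;

/* Carryless 64x64 multiply (what a single PMULL computes), bit by bit
 * purely for illustration. */
static u128 clmul_64x64(uint64_t a, uint64_t b)
{
	u128 acc = 0;

	for (int i = 0; i < 64; i++)
		if (a & (1ull << i))
			acc ^= (u128)b << i;
	return acc;
}

/* One fold step: replace the running value acc by something congruent to
 * acc * x^128 (mod G(x)), then absorb 16 new data bytes.  k_hi and k_lo are
 * precomputed remainders of the form x^n mod G(x). */
static u128 fold_16_bytes(u128 acc, u128 data, uint64_t k_hi, uint64_t k_lo)
{
	return clmul_64x64((uint64_t)(acc >> 64), k_hi) ^
	       clmul_64x64((uint64_t)acc, k_lo) ^
	       data;
}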
@@ -340,16 +205,7 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
 	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
 	.endm
 
-	.macro		__pmull_p64, rd, rn, rm, n
-	.ifb		\n
-	pmull		\rd\().1q, \rn\().1d, \rm\().1d
-	.else
-	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
-	.endif
-	.endm
-
 	.macro		crc_t10dif_pmull, p
-	__pmull_init_\p
 
 	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
 	cmp		len, #256
@@ -479,47 +335,7 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
 	pmull16x64_\p	fold_consts, v3, v0
 	eor		v7.16b, v3.16b, v0.16b
 	eor		v7.16b, v7.16b, v2.16b
-
-.Lreduce_final_16_bytes_\@:
-	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
-	movi		v2.16b, #0		// init zero register
-
-	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
-	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
-	__pmull_pre_\p	fold_consts
-
-	// Fold the high 64 bits into the low 64 bits, while also multiplying by
-	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
-	// whose low 48 bits are 0.
-	ext		v0.16b, v2.16b, v7.16b, #8
-	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
-	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64
-
-	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
-	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
-	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
-	mov		v0.s[3], v2.s[0]		// zero high 32 bits
-	__pmull_\p	v1, v1, fold_consts		// high 32 bits * x^48 * (x^48 mod G(x))
-	eor		v0.16b, v0.16b, v1.16b		// + low bits
-
-	// Load G(x) and floor(x^48 / G(x)).
-	ld1		{fold_consts.2d}, [fold_consts_ptr]
-	__pmull_pre_\p	fold_consts
-
-	// Use Barrett reduction to compute the final CRC value.
-	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
-	ushr		v1.2d, v1.2d, #32	// /= x^32
-	__pmull_\p	v1, v1, fold_consts	// *= G(x)
-	ushr		v0.2d, v0.2d, #48
-	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
-	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
-	umov		w0, v0.h[0]
-	.ifc		\p, p8
-	frame_pop
-	.endif
-	ret
+	b		.Lreduce_final_16_bytes_\@
 
 .Lless_than_256_bytes_\@:
 	// Checksumming a buffer of length 16...255 bytes
@@ -545,6 +361,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
 	add		len, len, #16
 	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
 	.endm
 
 //
@@ -554,7 +372,22 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 //
 SYM_FUNC_START(crc_t10dif_pmull_p8)
 	frame_push	1
+
+	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+	movi		perm.4h, #8, lsl #8
+	orr		perm.2s, #1, lsl #16
+	orr		perm.2s, #1, lsl #24
+	zip1		perm.16b, perm.16b, perm.16b
+	zip1		perm.16b, perm.16b, perm.16b
+
 	crc_t10dif_pmull p8
+
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+	str		q7, [x3]
+
+	frame_pop
+	ret
 SYM_FUNC_END(crc_t10dif_pmull_p8)
 
 	.align		5
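The movi/orr/zip1 sequence added to crc_t10dif_pmull_p8 above composes the TBL index vector { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } in registers instead of loading it from memory: movi sets each 32-bit lane to 0x08000800, the two orr immediates turn that into 0x09010800 (bytes 00 08 01 09), and two self-zips widen each byte to four copies. A small C re-derivation of that arithmetic, assuming little-endian byte order within the lanes:

#include <stdint.h>

static void build_perm(uint8_t out[16])
{
	uint32_t lane = 0x08000800;	/* movi v.4h, #8, lsl #8 */
	uint8_t  half[8], once[16];

	lane |= 1u << 16;		/* orr v.2s, #1, lsl #16 -> 0x08010800 */
	lane |= 1u << 24;		/* orr v.2s, #1, lsl #24 -> 0x09010800 */

	/* low 64 bits of the register, as bytes: 00 08 01 09 00 08 01 09 */
	for (int i = 0; i < 8; i++)
		half[i] = (lane >> (8 * (i % 4))) & 0xff;

	/* zip1 v.16b, v, v duplicates each of the low 8 bytes */
	for (int i = 0; i < 8; i++)
		once[2 * i] = once[2 * i + 1] = half[i];

	/* the second zip1 duplicates again: 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 */
	for (int i = 0; i < 8; i++)
		out[2 * i] = out[2 * i + 1] = once[i];
}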
@@ -565,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8)
 //
 SYM_FUNC_START(crc_t10dif_pmull_p64)
 	crc_t10dif_pmull p64
+
+	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+	movi		v2.16b, #0		// init zero register
+
+	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+
+	// Fold the high 64 bits into the low 64 bits, while also multiplying by
+	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+	// whose low 48 bits are 0.
+	ext		v0.16b, v2.16b, v7.16b, #8
+	pmull2		v7.1q, v7.2d, fold_consts.2d	// high bits * x^48 * (x^80 mod G(x))
+	eor		v0.16b, v0.16b, v7.16b		// + low bits * x^64
+
+	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
+	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
+	mov		v0.s[3], v2.s[0]		// zero high 32 bits
+	pmull		v1.1q, v1.1d, fold_consts.1d	// high 32 bits * x^48 * (x^48 mod G(x))
+	eor		v0.16b, v0.16b, v1.16b		// + low bits
+
+	// Load G(x) and floor(x^48 / G(x)).
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+
+	// Use Barrett reduction to compute the final CRC value.
+	pmull2		v1.1q, v0.2d, fold_consts.2d	// high 32 bits * floor(x^48 / G(x))
+	ushr		v1.2d, v1.2d, #32		// /= x^32
+	pmull		v1.1q, v1.1d, fold_consts.1d	// *= G(x)
+	ushr		v0.2d, v0.2d, #48
+	eor		v0.16b, v0.16b, v1.16b		// + low 16 nonzero bits
+	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+	umov		w0, v0.h[0]
+	ret
 SYM_FUNC_END(crc_t10dif_pmull_p64)
 
 	.section	".rodata", "a"
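With this change, the p64 entry point finishes the job itself and returns the final CRC in w0, while the p8 entry point stores the (byte-reversed on little-endian) 16-byte remainder through x3 and leaves the final reduction to the caller. For testing either path, a plain bit-at-a-time reference for CRC-16/T10-DIF (generator polynomial 0x8BB7, initial value 0, no reflection) can serve as ground truth; the function name below is illustrative, not a kernel API:

#include <stdint.h>
#include <stddef.h>

static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)(*buf++) << 8;		/* feed in one byte, MSB first */
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (uint16_t)(crc << 1) ^ 0x8bb7
					     : (uint16_t)(crc << 1);
	}
	return crc;
}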