 FOLD_CONST_L    .req            q10l
 FOLD_CONST_H    .req            q10h

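+// Fold helper: multiply the low and high halves of \v64 by the 64-bit fold
+// constants \v16\()_L and \v16\()_H (VMULL.P64 polynomial multiplies) and
+// XOR the two products together into \v64.  This p64 variant clobbers q11;
+// callers select it through the \p argument of the macros below.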
+       .macro          pmull16x64_p64, v16, v64
+       vmull.p64       q11, \v64\()l, \v16\()_L
+       vmull.p64       \v64, \v64\()h, \v16\()_H
+       veor            \v64, \v64, q11
+       .endm
+
 // Fold reg1, reg2 into the next 32 data bytes, storing the result back
 // into reg1, reg2.
-       .macro          fold_32_bytes, reg1, reg2
-       vld1.64         {q11-q12}, [buf]!
+       .macro          fold_32_bytes, reg1, reg2, p
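+       // Load into q8-q9 rather than q11-q12, since q11 is used as a
+       // scratch register by pmull16x64_\p.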
+       vld1.64         {q8-q9}, [buf]!

-       vmull.p64       q8, \reg1\()h, FOLD_CONST_H
-       vmull.p64       \reg1, \reg1\()l, FOLD_CONST_L
-       vmull.p64       q9, \reg2\()h, FOLD_CONST_H
-       vmull.p64       \reg2, \reg2\()l, FOLD_CONST_L
+       pmull16x64_\p   FOLD_CONST, \reg1
+       pmull16x64_\p   FOLD_CONST, \reg2

-CPU_LE( vrev64.8        q11, q11        )
-CPU_LE( vrev64.8        q12, q12        )
-       vswp            q11l, q11h
-       vswp            q12l, q12h
+CPU_LE( vrev64.8        q8, q8  )
+CPU_LE( vrev64.8        q9, q9  )
+       vswp            q8l, q8h
+       vswp            q9l, q9h

        veor.8          \reg1, \reg1, q8
        veor.8          \reg2, \reg2, q9
-       veor.8          \reg1, \reg1, q11
-       veor.8          \reg2, \reg2, q12
        .endm

 // Fold src_reg into dst_reg, optionally loading the next fold constants
-       .macro          fold_16_bytes, src_reg, dst_reg, load_next_consts
-       vmull.p64       q8, \src_reg\()l, FOLD_CONST_L
-       vmull.p64       \src_reg, \src_reg\()h, FOLD_CONST_H
+       .macro          fold_16_bytes, src_reg, dst_reg, p, load_next_consts
+       pmull16x64_\p   FOLD_CONST, \src_reg
        .ifnb           \load_next_consts
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        .endif
-       veor.8          \dst_reg, \dst_reg, q8
        veor.8          \dst_reg, \dst_reg, \src_reg
        .endm

-//
-// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-ENTRY(crc_t10dif_pmull)
-
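+// Entire CRC computation, parameterised by \p, the suffix of the
+// pmull16x64_\p helper to use.  Local labels carry \@ so that each expansion
+// of this macro gets its own copies.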
+       .macro          crct10dif, p
        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
        cmp             len, #256
-       blt             .Lless_than_256_bytes
+       blt             .Lless_than_256_bytes\@

        mov_l           fold_consts_ptr, .Lfold_across_128_bytes_consts

@@ -194,27 +188,27 @@ CPU_LE( vrev64.8 q7, q7 )

        // While >= 128 data bytes remain (not counting q0-q7), fold the 128
        // bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop:
-       fold_32_bytes   q0, q1
-       fold_32_bytes   q2, q3
-       fold_32_bytes   q4, q5
-       fold_32_bytes   q6, q7
+.Lfold_128_bytes_loop\@:
+       fold_32_bytes   q0, q1, \p
+       fold_32_bytes   q2, q3, \p
+       fold_32_bytes   q4, q5, \p
+       fold_32_bytes   q6, q7, \p
        subs            len, len, #128
-       bge             .Lfold_128_bytes_loop
+       bge             .Lfold_128_bytes_loop\@

        // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.

        // Fold across 64 bytes.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-       fold_16_bytes   q0, q4
-       fold_16_bytes   q1, q5
-       fold_16_bytes   q2, q6
-       fold_16_bytes   q3, q7, 1
+       fold_16_bytes   q0, q4, \p
+       fold_16_bytes   q1, q5, \p
+       fold_16_bytes   q2, q6, \p
+       fold_16_bytes   q3, q7, \p, 1
        // Fold across 32 bytes.
-       fold_16_bytes   q4, q6
-       fold_16_bytes   q5, q7, 1
+       fold_16_bytes   q4, q6, \p
+       fold_16_bytes   q5, q7, \p, 1
        // Fold across 16 bytes.
-       fold_16_bytes   q6, q7
+       fold_16_bytes   q6, q7, \p

        // Add 128 to get the correct number of data bytes remaining in 0...127
        // (not counting q7), following the previous extra subtraction by 128.
@@ -224,25 +218,23 @@ CPU_LE( vrev64.8 q7, q7 )

        // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
        // into them, storing the result back into q7.
-       blt             .Lfold_16_bytes_loop_done
-.Lfold_16_bytes_loop:
-       vmull.p64       q8, q7l, FOLD_CONST_L
-       vmull.p64       q7, q7h, FOLD_CONST_H
-       veor.8          q7, q7, q8
+       blt             .Lfold_16_bytes_loop_done\@
+.Lfold_16_bytes_loop\@:
+       pmull16x64_\p   FOLD_CONST, q7
        vld1.64         {q0}, [buf]!
 CPU_LE( vrev64.8        q0, q0  )
        vswp            q0l, q0h
        veor.8          q7, q7, q0
        subs            len, len, #16
-       bge             .Lfold_16_bytes_loop
+       bge             .Lfold_16_bytes_loop\@

-.Lfold_16_bytes_loop_done:
+.Lfold_16_bytes_loop_done\@:
        // Add 16 to get the correct number of data bytes remaining in 0...15
        // (not counting q7), following the previous extra subtraction by 16.
        adds            len, len, #16
-       beq             .Lreduce_final_16_bytes
+       beq             .Lreduce_final_16_bytes\@

-.Lhandle_partial_segment:
+.Lhandle_partial_segment\@:
        // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
        // 16 bytes are in q7 and the rest are the remaining data in 'buf'.  To
        // do this without needing a fold constant for each possible 'len',
@@ -277,12 +269,46 @@ CPU_LE( vrev64.8 q0, q0 )
        vbsl.8          q2, q1, q0

        // Fold the first chunk into the second chunk, storing the result in q7.
-       vmull.p64       q0, q3l, FOLD_CONST_L
-       vmull.p64       q7, q3h, FOLD_CONST_H
-       veor.8          q7, q7, q0
-       veor.8          q7, q7, q2
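+       // pmull16x64_\p folds the two halves of q3 together in place, so a
+       // single XOR with q2 completes the fold.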
+       pmull16x64_\p   FOLD_CONST, q3
+       veor.8          q7, q3, q2
+       b               .Lreduce_final_16_bytes\@
+
+.Lless_than_256_bytes\@:
+       // Checksumming a buffer of length 16...255 bytes
+
+       mov_l           fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+       // Load the first 16 data bytes.
+       vld1.64         {q7}, [buf]!
+CPU_LE( vrev64.8        q7, q7  )
+       vswp            q7l, q7h
+
+       // XOR the first 16 data *bits* with the initial CRC value.
+       vmov.i8         q0h, #0
+       vmov.u16        q0h[3], init_crc
+       veor.8          q7h, q7h, q0h
+
+       // Load the fold-across-16-bytes constants.
+       vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+       cmp             len, #16
+       beq             .Lreduce_final_16_bytes\@       // len == 16
+       subs            len, len, #32
+       addlt           len, len, #16
+       blt             .Lhandle_partial_segment\@      // 17 <= len <= 31
+       b               .Lfold_16_bytes_loop\@          // 32 <= len <= 255
+
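+       // The final reduction is not part of the macro: branches to this
+       // label fall through into the code emitted right after the macro
+       // expansion, which reduces the 128-bit value in q7 to the 16-bit CRC.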
+.Lreduce_final_16_bytes\@:
+       .endm
+
+//
+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull64)
+       crct10dif       p64

-.Lreduce_final_16_bytes:
        // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.

        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
@@ -316,31 +342,7 @@ CPU_LE( vrev64.8 q0, q0 )
        vmov.u16        r0, q0l[0]
        bx              lr

-.Lless_than_256_bytes:
-       // Checksumming a buffer of length 16...255 bytes
-
-       mov_l           fold_consts_ptr, .Lfold_across_16_bytes_consts
-
-       // Load the first 16 data bytes.
-       vld1.64         {q7}, [buf]!
-CPU_LE( vrev64.8        q7, q7  )
-       vswp            q7l, q7h
-
-       // XOR the first 16 data *bits* with the initial CRC value.
-       vmov.i8         q0h, #0
-       vmov.u16        q0h[3], init_crc
-       veor.8          q7h, q7h, q0h
-
-       // Load the fold-across-16-bytes constants.
-       vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
-       cmp             len, #16
-       beq             .Lreduce_final_16_bytes // len == 16
-       subs            len, len, #32
-       addlt           len, len, #16
-       blt             .Lhandle_partial_segment        // 17 <= len <= 31
-       b               .Lfold_16_bytes_loop            // 32 <= len <= 255
-ENDPROC(crc_t10dif_pmull)
+ENDPROC(crc_t10dif_pmull64)

        .section        ".rodata", "a"
        .align          4