
Commit 510de48

pablodelara authored and mdcornu committed
crc: only prefetch data that will be consumed for non-VPCLMUL functions
Also, use only 2 prefetch instructions per 128B.

Signed-off-by: Pablo de Lara <[email protected]>
1 parent 46b5272 commit 510de48

12 files changed: +863 -34 lines changed
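The change keeps prefetching only while the prefetched address is guaranteed to be consumed: each non-VPCLMUL function now has a prefetch-and-fold loop that runs while at least fetch_dist plus one block of data remains, then falls through to an identical fold loop with no prefetches for the rest of the buffer. The C sketch below illustrates that control flow only and is not the library's code: the PCLMULQDQ folding is replaced by a placeholder XOR, FETCH_DIST, fold128 and fold_buffer are made-up names, and it assumes a 4 KB fetch distance and 64-byte cache lines (two prefetches per 128-byte block, matching "use only 2 prefetch instructions per 128B"; the exact offsets of the remaining PREFETCH instructions lie outside the hunks shown).

/*
 * Illustrative sketch only (not isa-l code): the prefetch-gated loop
 * structure introduced by this commit. The real functions fold with
 * PCLMULQDQ; here the "fold" is a placeholder XOR.
 */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define FETCH_DIST 4096   /* assumed prefetch distance ("4kb" in the asm comments) */

static uint64_t fold128(uint64_t state, const uint8_t *p)
{
	/* stand-in for the 8-register PCLMULQDQ fold of one 128B block */
	for (int i = 0; i < 128; i += 8) {
		uint64_t v;
		memcpy(&v, p + i, sizeof(v));
		state ^= v;
	}
	return state;
}

uint64_t fold_buffer(uint64_t state, const uint8_t *buf, size_t len)
{
	/* prefetch-and-fold loop: runs only while buf + FETCH_DIST + 128
	 * still lies inside the buffer, so nothing past the end is fetched */
	while (len >= FETCH_DIST + 128) {
		_mm_prefetch((const char *)buf + FETCH_DIST, _MM_HINT_T0);
		_mm_prefetch((const char *)buf + FETCH_DIST + 64, _MM_HINT_T0);
		state = fold128(state, buf);
		buf += 128;
		len -= 128;
	}

	/* identical fold loop for the remaining data, with no prefetches */
	while (len >= 128) {
		state = fold128(state, buf);
		buf += 128;
		len -= 128;
	}
	return state;
}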

crc/crc16_t10dif_01.asm

Lines changed: 79 additions & 4 deletions
@@ -147,10 +147,14 @@ crc16_t10dif_01:
 	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 	; loop will fold 128B at a time until we have 128+y Bytes of buffer
 
+%if fetch_dist != 0
+	; check if there is at least 4kb (fetch distance) + 128b in the buffer
+	cmp	arg3, (fetch_dist + 128)
+	jb	_fold_128_B_loop
 
 	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
-_fold_128_B_loop:
-
+align 16
+_fold_and_prefetch_128_B_loop:
 	; update the buffer pointer
 	add	arg2, 128 ; buf += 128;
 
@@ -170,7 +174,6 @@ _fold_128_B_loop:
 	pxor	xmm1, xmm12
 	xorps	xmm1, xmm13
 
-	PREFETCH [arg2+fetch_dist+32]
 	movdqu	xmm9, [arg2+16*2]
 	movdqu	xmm12, [arg2+16*3]
 	pshufb	xmm9, xmm11
@@ -202,7 +205,79 @@ _fold_128_B_loop:
 	pxor	xmm5, xmm12
 	xorps	xmm5, xmm13
 
-	PREFETCH [arg2+fetch_dist+96]
+	movdqu	xmm9, [arg2+16*6]
+	movdqu	xmm12, [arg2+16*7]
+	pshufb	xmm9, xmm11
+	pshufb	xmm12, xmm11
+	movdqa	xmm8, xmm6
+	movdqa	xmm13, xmm7
+	pclmulqdq	xmm6, xmm10, 0x0
+	pclmulqdq	xmm8, xmm10 , 0x11
+	pclmulqdq	xmm7, xmm10, 0x0
+	pclmulqdq	xmm13, xmm10 , 0x11
+	pxor	xmm6, xmm9
+	xorps	xmm6, xmm8
+	pxor	xmm7, xmm12
+	xorps	xmm7, xmm13
+
+	sub	arg3, 128
+
+	; check if there is another 128B in the buffer to be able to fold
+	cmp	arg3, (fetch_dist + 128)
+	jge	_fold_and_prefetch_128_B_loop
+%endif ; fetch_dist != 0
+
+	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+align 16
+_fold_128_B_loop:
+	; update the buffer pointer
+	add	arg2, 128 ; buf += 128;
+
+	movdqu	xmm9, [arg2+16*0]
+	movdqu	xmm12, [arg2+16*1]
+	pshufb	xmm9, xmm11
+	pshufb	xmm12, xmm11
+	movdqa	xmm8, xmm0
+	movdqa	xmm13, xmm1
+	pclmulqdq	xmm0, xmm10, 0x0
+	pclmulqdq	xmm8, xmm10 , 0x11
+	pclmulqdq	xmm1, xmm10, 0x0
+	pclmulqdq	xmm13, xmm10 , 0x11
+	pxor	xmm0, xmm9
+	xorps	xmm0, xmm8
+	pxor	xmm1, xmm12
+	xorps	xmm1, xmm13
+
+	movdqu	xmm9, [arg2+16*2]
+	movdqu	xmm12, [arg2+16*3]
+	pshufb	xmm9, xmm11
+	pshufb	xmm12, xmm11
+	movdqa	xmm8, xmm2
+	movdqa	xmm13, xmm3
+	pclmulqdq	xmm2, xmm10, 0x0
+	pclmulqdq	xmm8, xmm10 , 0x11
+	pclmulqdq	xmm3, xmm10, 0x0
+	pclmulqdq	xmm13, xmm10 , 0x11
+	pxor	xmm2, xmm9
+	xorps	xmm2, xmm8
+	pxor	xmm3, xmm12
+	xorps	xmm3, xmm13
+
+	movdqu	xmm9, [arg2+16*4]
+	movdqu	xmm12, [arg2+16*5]
+	pshufb	xmm9, xmm11
+	pshufb	xmm12, xmm11
+	movdqa	xmm8, xmm4
+	movdqa	xmm13, xmm5
+	pclmulqdq	xmm4, xmm10, 0x0
+	pclmulqdq	xmm8, xmm10 , 0x11
+	pclmulqdq	xmm5, xmm10, 0x0
+	pclmulqdq	xmm13, xmm10 , 0x11
+	pxor	xmm4, xmm9
+	xorps	xmm4, xmm8
+	pxor	xmm5, xmm12
+	xorps	xmm5, xmm13
+
 	movdqu	xmm9, [arg2+16*6]
 	movdqu	xmm12, [arg2+16*7]
 	pshufb	xmm9, xmm11

crc/crc16_t10dif_02.asm

Lines changed: 81 additions & 4 deletions
@@ -148,10 +148,14 @@ crc16_t10dif_02:
 	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 	; loop will fold 128B at a time until we have 128+y Bytes of buffer
 
+%if fetch_dist != 0
+	; check if there is at least 4kb (fetch distance) + 128b in the buffer
+	cmp	arg3, (fetch_dist + 128)
+	jb	_fold_128_B_loop
 
 	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
-_fold_128_B_loop:
-
+align 16
+_fold_and_prefetch_128_B_loop:
 	; update the buffer pointer
 	add	arg2, 128 ; buf += 128;
 
@@ -171,7 +175,6 @@ _fold_128_B_loop:
 	vpxor	xmm1, xmm12
 	vxorps	xmm1, xmm13
 
-	PREFETCH [arg2+fetch_dist+32]
 	vmovdqu	xmm9, [arg2+16*2]
 	vmovdqu	xmm12, [arg2+16*3]
 	vpshufb	xmm9, xmm11
@@ -203,7 +206,81 @@ _fold_128_B_loop:
 	vpxor	xmm5, xmm12
 	vxorps	xmm5, xmm13
 
-	PREFETCH [arg2+fetch_dist+96]
+	vmovdqu	xmm9, [arg2+16*6]
+	vmovdqu	xmm12, [arg2+16*7]
+	vpshufb	xmm9, xmm11
+	vpshufb	xmm12, xmm11
+	vmovdqa	xmm8, xmm6
+	vmovdqa	xmm13, xmm7
+	vpclmulqdq	xmm6, xmm10, 0x0
+	vpclmulqdq	xmm8, xmm10 , 0x11
+	vpclmulqdq	xmm7, xmm10, 0x0
+	vpclmulqdq	xmm13, xmm10 , 0x11
+	vpxor	xmm6, xmm9
+	vxorps	xmm6, xmm8
+	vpxor	xmm7, xmm12
+	vxorps	xmm7, xmm13
+
+	sub	arg3, 128
+
+	; check if there is another 128B in the buffer to be able to fold
+	cmp	arg3, (fetch_dist + 128)
+	jge	_fold_and_prefetch_128_B_loop
+%endif ; fetch_dist != 0
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+align 16
+_fold_128_B_loop:
+
+	; update the buffer pointer
+	add	arg2, 128 ; buf += 128;
+
+	vmovdqu	xmm9, [arg2+16*0]
+	vmovdqu	xmm12, [arg2+16*1]
+	vpshufb	xmm9, xmm11
+	vpshufb	xmm12, xmm11
+	vmovdqa	xmm8, xmm0
+	vmovdqa	xmm13, xmm1
+	vpclmulqdq	xmm0, xmm10, 0x0
+	vpclmulqdq	xmm8, xmm10 , 0x11
+	vpclmulqdq	xmm1, xmm10, 0x0
+	vpclmulqdq	xmm13, xmm10 , 0x11
+	vpxor	xmm0, xmm9
+	vxorps	xmm0, xmm8
+	vpxor	xmm1, xmm12
+	vxorps	xmm1, xmm13
+
+	vmovdqu	xmm9, [arg2+16*2]
+	vmovdqu	xmm12, [arg2+16*3]
+	vpshufb	xmm9, xmm11
+	vpshufb	xmm12, xmm11
+	vmovdqa	xmm8, xmm2
+	vmovdqa	xmm13, xmm3
+	vpclmulqdq	xmm2, xmm10, 0x0
+	vpclmulqdq	xmm8, xmm10 , 0x11
+	vpclmulqdq	xmm3, xmm10, 0x0
+	vpclmulqdq	xmm13, xmm10 , 0x11
+	vpxor	xmm2, xmm9
+	vxorps	xmm2, xmm8
+	vpxor	xmm3, xmm12
+	vxorps	xmm3, xmm13
+
+	vmovdqu	xmm9, [arg2+16*4]
+	vmovdqu	xmm12, [arg2+16*5]
+	vpshufb	xmm9, xmm11
+	vpshufb	xmm12, xmm11
+	vmovdqa	xmm8, xmm4
+	vmovdqa	xmm13, xmm5
+	vpclmulqdq	xmm4, xmm10, 0x0
+	vpclmulqdq	xmm8, xmm10 , 0x11
+	vpclmulqdq	xmm5, xmm10, 0x0
+	vpclmulqdq	xmm13, xmm10 , 0x11
+	vpxor	xmm4, xmm9
+	vxorps	xmm4, xmm8
+	vpxor	xmm5, xmm12
+	vxorps	xmm5, xmm13
+
 	vmovdqu	xmm9, [arg2+16*6]
 	vmovdqu	xmm12, [arg2+16*7]
 	vpshufb	xmm9, xmm11

crc/crc16_t10dif_by4.asm

Lines changed: 61 additions & 2 deletions
@@ -130,10 +130,15 @@ crc16_t10dif_by4:
 	; buffer. The _fold_64_B_loop
 	; loop will fold 64B at a time until we have 64+y Bytes of buffer
 
+%if fetch_dist != 0
+	; check if there is at least 4KB (fetch distance) + 64B in the buffer
+	cmp	arg3, (fetch_dist + 64)
+	jb	_fold_64_B_loop
 
 	; fold 64B at a time. This section of the code folds 4 xmm
 	; registers in parallel
-_fold_64_B_loop:
+align 16
+_fold_and_prefetch_64_B_loop:
 
 	; update the buffer pointer
 	add	arg2, 64 ; buf += 64;
@@ -151,7 +156,61 @@ _fold_64_B_loop:
 	pxor	xmm0, xmm4
 	pxor	xmm1, xmm5
 
-	PREFETCH [arg2+fetch_dist+32]
+	movdqu	xmm4, xmm2
+	movdqu	xmm5, xmm3
+
+	pclmulqdq	xmm2, xmm6, 0x11
+	pclmulqdq	xmm3, xmm6, 0x11
+
+	pclmulqdq	xmm4, xmm6, 0x0
+	pclmulqdq	xmm5, xmm6, 0x0
+
+	pxor	xmm2, xmm4
+	pxor	xmm3, xmm5
+
+	movdqu	xmm4, [arg2]
+	movdqu	xmm5, [arg2+16]
+	pshufb	xmm4, xmm7
+	pshufb	xmm5, xmm7
+	pxor	xmm0, xmm4
+	pxor	xmm1, xmm5
+
+	movdqu	xmm4, [arg2+32]
+	movdqu	xmm5, [arg2+48]
+	pshufb	xmm4, xmm7
+	pshufb	xmm5, xmm7
+
+	pxor	xmm2, xmm4
+	pxor	xmm3, xmm5
+
+	sub	arg3, 64
+
+	; check if there is another 64B in the buffer to be able to fold
+	cmp	arg3, (fetch_dist + 64)
+	jge	_fold_and_prefetch_64_B_loop
+%endif ; fetch_dist != 0
+
+	; fold 64B at a time. This section of the code folds 4 xmm
+	; registers in parallel
+
+align 16
+_fold_64_B_loop:
+
+	; update the buffer pointer
+	add	arg2, 64 ; buf += 64;
+
+	movdqu	xmm4, xmm0
+	movdqu	xmm5, xmm1
+
+	pclmulqdq	xmm0, xmm6 , 0x11
+	pclmulqdq	xmm1, xmm6 , 0x11
+
+	pclmulqdq	xmm4, xmm6, 0x0
+	pclmulqdq	xmm5, xmm6, 0x0
+
+	pxor	xmm0, xmm4
+	pxor	xmm1, xmm5
+
 	movdqu	xmm4, xmm2
 	movdqu	xmm5, xmm3

crc/crc16_t10dif_copy_by4.asm

Lines changed: 67 additions & 2 deletions
@@ -139,10 +139,15 @@ crc16_t10dif_copy_by4:
 	; buffer. The _fold_64_B_loop
 	; loop will fold 64B at a time until we have 64+y Bytes of buffer
 
+%if fetch_dist != 0
+	; check if there is at least 4KB (fetch distance) + 64B in the buffer
+	cmp	arg4, (fetch_dist + 64)
+	jb	_fold_64_B_loop
 
 	; fold 64B at a time. This section of the code folds 4 xmm
 	; registers in parallel
-_fold_64_B_loop:
+align 16
+_fold_and_prefetch_64_B_loop:
 
 	; update the buffer pointer
 	add	arg3, 64 ; buf += 64;
@@ -161,7 +166,67 @@ _fold_64_B_loop:
 	pxor	xmm0, xmm4
 	pxor	xmm1, xmm5
 
-	PREFETCH [arg3+fetch_dist+32]
+	movdqu	xmm4, xmm2
+	movdqu	xmm5, xmm3
+
+	pclmulqdq	xmm2, xmm6, 0x11
+	pclmulqdq	xmm3, xmm6, 0x11
+
+	pclmulqdq	xmm4, xmm6, 0x0
+	pclmulqdq	xmm5, xmm6, 0x0
+
+	pxor	xmm2, xmm4
+	pxor	xmm3, xmm5
+
+	movdqu	xmm4, [arg3]
+	movdqu	xmm5, [arg3+16]
+	movdqu	[arg2], xmm4
+	movdqu	[arg2+16], xmm5
+	pshufb	xmm4, xmm7
+	pshufb	xmm5, xmm7
+	pxor	xmm0, xmm4
+	pxor	xmm1, xmm5
+
+	movdqu	xmm4, [arg3+32]
+	movdqu	xmm5, [arg3+48]
+	movdqu	[arg2+32], xmm4
+	movdqu	[arg2+48], xmm5
+	pshufb	xmm4, xmm7
+	pshufb	xmm5, xmm7
+
+	pxor	xmm2, xmm4
+	pxor	xmm3, xmm5
+
+	sub	arg4, 64
+
+	; check if there is another 4KB (fetch distance) + 64B in the buffer
+	cmp	arg4, (fetch_dist + 64)
+	jge	_fold_and_prefetch_64_B_loop
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%endif ; fetch_dist != 0
+
+	; fold 64B at a time. This section of the code folds 4 xmm
+	; registers in parallel
+align 16
+_fold_64_B_loop:
+
+	; update the buffer pointer
+	add	arg3, 64 ; buf += 64;
+	add	arg2, 64
+
+	movdqu	xmm4, xmm0
+	movdqu	xmm5, xmm1
+
+	pclmulqdq	xmm0, xmm6 , 0x11
+	pclmulqdq	xmm1, xmm6 , 0x11
+
+	pclmulqdq	xmm4, xmm6, 0x0
+	pclmulqdq	xmm5, xmm6, 0x0
+
+	pxor	xmm0, xmm4
+	pxor	xmm1, xmm5
+
 	movdqu	xmm4, xmm2
 	movdqu	xmm5, xmm3

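For the copy variant (crc16_t10dif_copy_by4.asm above), the same gating applies to a loop that copies each 64B block to the destination and folds it in the same pass, with only the source stream prefetched. The sketch below is again a rough, hypothetical C illustration rather than the library's code: copy_fold64, copy_fold_buffer and FETCH_DIST are made-up names, the fold is a placeholder XOR, and a single prefetch per 64B iteration is assumed (the retained PREFETCH is outside the hunks shown).

/* Illustrative sketch only: single-pass copy + fold with a prefetch-gated
 * main loop, mirroring the new structure of crc16_t10dif_copy_by4. */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define FETCH_DIST 4096   /* assumed prefetch distance */

static uint64_t copy_fold64(uint64_t state, uint8_t *dst, const uint8_t *src)
{
	/* stand-in for the real copy + PCLMULQDQ fold of one 64B block */
	memcpy(dst, src, 64);
	for (int i = 0; i < 64; i += 8) {
		uint64_t v;
		memcpy(&v, src + i, sizeof(v));
		state ^= v;
	}
	return state;
}

uint64_t copy_fold_buffer(uint64_t state, uint8_t *dst,
			  const uint8_t *src, size_t len)
{
	/* prefetch the source only while src + FETCH_DIST + 64 is still
	 * inside the buffer (the asm's cmp arg4, (fetch_dist + 64)) */
	while (len >= FETCH_DIST + 64) {
		_mm_prefetch((const char *)src + FETCH_DIST, _MM_HINT_T0);
		state = copy_fold64(state, dst, src);
		src += 64;
		dst += 64;
		len -= 64;
	}

	/* remaining full blocks: copy and fold without prefetching */
	while (len >= 64) {
		state = copy_fold64(state, dst, src);
		src += 64;
		dst += 64;
		len -= 64;
	}
	return state;
}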