Skip to content

Commit e677f66

Browse files
pablodelara authored and mdcornu committed
crc: only prefetch data that will be consumed for VPCLMUL functions
Signed-off-by: Pablo de Lara <[email protected]>
1 parent 510de48 commit e677f66

6 files changed

+224 -12 lines changed

crc/crc16_t10dif_by16_10.asm

Lines changed: 40 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -145,7 +145,13 @@ FUNCTION_NAME:
145145
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
146146
sub arg3, 256
147147

148-
.fold_256_B_loop:
148+
%if fetch_dist != 0
149+
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
150+
cmp arg3, (fetch_dist + 256)
151+
jb .fold_256_B_loop
152+
153+
align 16
154+
.fold_and_prefetch_256_B_loop:
149155
add arg2, 256
150156
PREFETCH [arg2+fetch_dist+0]
151157
vmovdqu8 zmm3, [arg2+16*0]
@@ -175,6 +181,39 @@ FUNCTION_NAME:
175181
vpclmulqdq zmm8, zmm8, zmm16, 0x11
176182
vpternlogq zmm8, zmm14, zmm17, 0x96
177183

184+
sub arg3, 256
185+
186+
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
187+
cmp arg3, (fetch_dist + 256)
188+
jge .fold_and_prefetch_256_B_loop
189+
%endif ; fetch_dist != 0
190+
191+
.fold_256_B_loop:
192+
add arg2, 256
193+
vmovdqu8 zmm3, [arg2+16*0]
194+
vpshufb zmm3, zmm3, zmm18
195+
vpclmulqdq zmm1, zmm0, zmm16, 0x00
196+
vpclmulqdq zmm0, zmm0, zmm16, 0x11
197+
vpternlogq zmm0, zmm1, zmm3, 0x96
198+
199+
vmovdqu8 zmm9, [arg2+16*4]
200+
vpshufb zmm9, zmm9, zmm18
201+
vpclmulqdq zmm5, zmm4, zmm16, 0x00
202+
vpclmulqdq zmm4, zmm4, zmm16, 0x11
203+
vpternlogq zmm4, zmm5, zmm9, 0x96
204+
205+
vmovdqu8 zmm11, [arg2+16*8]
206+
vpshufb zmm11, zmm11, zmm18
207+
vpclmulqdq zmm12, zmm7, zmm16, 0x00
208+
vpclmulqdq zmm7, zmm7, zmm16, 0x11
209+
vpternlogq zmm7, zmm12, zmm11, 0x96
210+
211+
vmovdqu8 zmm17, [arg2+16*12]
212+
vpshufb zmm17, zmm17, zmm18
213+
vpclmulqdq zmm14, zmm8, zmm16, 0x00
214+
vpclmulqdq zmm8, zmm8, zmm16, 0x11
215+
vpternlogq zmm8, zmm14, zmm17, 0x96
216+
178217
sub arg3, 256
179218
jge .fold_256_B_loop
180219

crc/crc32_gzip_refl_by16_10.asm

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -133,8 +133,13 @@ FUNCTION_NAME:
133133
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
134134
sub arg3, 256
135135

136+
%if fetch_dist != 0
137+
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
138+
cmp arg3, (fetch_dist + 256)
139+
jb .fold_256_B_loop
140+
136141
align 16
137-
.fold_256_B_loop:
142+
.fold_and_prefetch_256_B_loop:
138143
add arg2, 256
139144
PREFETCH [arg2+fetch_dist+0]
140145
vpclmulqdq zmm1, zmm0, zmm16, 0x10
@@ -156,6 +161,32 @@ align 16
156161
vpclmulqdq zmm8, zmm8, zmm16, 0x01
157162
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
158163

164+
sub arg3, 256
165+
166+
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
167+
cmp arg3, (fetch_dist + 256)
168+
jge .fold_and_prefetch_256_B_loop
169+
%endif ; fetch_dist != 0
170+
171+
align 16
172+
.fold_256_B_loop:
173+
add arg2, 256
174+
vpclmulqdq zmm1, zmm0, zmm16, 0x10
175+
vpclmulqdq zmm0, zmm0, zmm16, 0x01
176+
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
177+
178+
vpclmulqdq zmm2, zmm4, zmm16, 0x10
179+
vpclmulqdq zmm4, zmm4, zmm16, 0x01
180+
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
181+
182+
vpclmulqdq zmm3, zmm7, zmm16, 0x10
183+
vpclmulqdq zmm7, zmm7, zmm16, 0x01
184+
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
185+
186+
vpclmulqdq zmm5, zmm8, zmm16, 0x10
187+
vpclmulqdq zmm8, zmm8, zmm16, 0x01
188+
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
189+
159190
sub arg3, 256
160191
jge .fold_256_B_loop
161192

crc/crc32_ieee_by16_10.asm

Lines changed: 40 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -133,8 +133,13 @@ FUNCTION_NAME:
133133
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
134134
sub arg3, 256
135135

136+
%if fetch_dist != 0
137+
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
138+
cmp arg3, (fetch_dist + 256)
139+
jb .fold_256_B_loop
140+
136141
align 16
137-
.fold_256_B_loop:
142+
.fold_and_prefetch_256_B_loop:
138143
add arg2, 256
139144
vmovdqu8 zmm3, [arg2+16*0]
140145
PREFETCH [arg2+fetch_dist+0]
@@ -164,6 +169,40 @@ align 16
164169
vpclmulqdq zmm8, zmm8, zmm16, 0x11
165170
vpternlogq zmm8, zmm14, zmm17, 0x96
166171

172+
sub arg3, 256
173+
174+
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
175+
cmp arg3, (fetch_dist + 256)
176+
jge .fold_and_prefetch_256_B_loop
177+
%endif ; fetch_dist != 0
178+
179+
align 16
180+
.fold_256_B_loop:
181+
add arg2, 256
182+
vmovdqu8 zmm3, [arg2+16*0]
183+
vpshufb zmm3, zmm3, zmm18
184+
vpclmulqdq zmm1, zmm0, zmm16, 0x00
185+
vpclmulqdq zmm0, zmm0, zmm16, 0x11
186+
vpternlogq zmm0, zmm1, zmm3, 0x96
187+
188+
vmovdqu8 zmm9, [arg2+16*4]
189+
vpshufb zmm9, zmm9, zmm18
190+
vpclmulqdq zmm5, zmm4, zmm16, 0x00
191+
vpclmulqdq zmm4, zmm4, zmm16, 0x11
192+
vpternlogq zmm4, zmm5, zmm9, 0x96
193+
194+
vmovdqu8 zmm11, [arg2+16*8]
195+
vpshufb zmm11, zmm11, zmm18
196+
vpclmulqdq zmm12, zmm7, zmm16, 0x00
197+
vpclmulqdq zmm7, zmm7, zmm16, 0x11
198+
vpternlogq zmm7, zmm12, zmm11, 0x96
199+
200+
vmovdqu8 zmm17, [arg2+16*12]
201+
vpshufb zmm17, zmm17, zmm18
202+
vpclmulqdq zmm14, zmm8, zmm16, 0x00
203+
vpclmulqdq zmm8, zmm8, zmm16, 0x11
204+
vpternlogq zmm8, zmm14, zmm17, 0x96
205+
167206
sub arg3, 256
168207
jge .fold_256_B_loop
169208

crc/crc32_iscsi_by16_10.asm

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -122,8 +122,13 @@ FUNCTION_NAME:
122122
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
123123
sub arg3, 256
124124

125+
%if fetch_dist != 0
126+
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
127+
cmp arg3, (fetch_dist + 256)
128+
jb .fold_256_B_loop
129+
125130
align 16
126-
.fold_256_B_loop:
131+
.fold_and_prefetch_256_B_loop:
127132
add arg2, 256
128133
PREFETCH [arg2+fetch_dist+0]
129134
vpclmulqdq zmm1, zmm0, zmm16, 0x10
@@ -145,6 +150,32 @@ align 16
145150
vpclmulqdq zmm8, zmm8, zmm16, 0x01
146151
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
147152

153+
sub arg3, 256
154+
155+
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
156+
cmp arg3, (fetch_dist + 256)
157+
jge .fold_and_prefetch_256_B_loop
158+
%endif ; fetch_dist != 0
159+
160+
align 16
161+
.fold_256_B_loop:
162+
add arg2, 256
163+
vpclmulqdq zmm1, zmm0, zmm16, 0x10
164+
vpclmulqdq zmm0, zmm0, zmm16, 0x01
165+
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
166+
167+
vpclmulqdq zmm2, zmm4, zmm16, 0x10
168+
vpclmulqdq zmm4, zmm4, zmm16, 0x01
169+
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
170+
171+
vpclmulqdq zmm3, zmm7, zmm16, 0x10
172+
vpclmulqdq zmm7, zmm7, zmm16, 0x01
173+
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
174+
175+
vpclmulqdq zmm5, zmm8, zmm16, 0x10
176+
vpclmulqdq zmm8, zmm8, zmm16, 0x01
177+
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
178+
148179
sub arg3, 256
149180
jge .fold_256_B_loop
150181

crc/crc64_iso_norm_by16_10.asm

Lines changed: 44 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -115,30 +115,70 @@ FUNCTION_NAME:
115115
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
116116
sub arg3, 256
117117

118+
%if fetch_dist != 0
119+
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
120+
cmp arg3, (fetch_dist + 256)
121+
jb _fold_256_B_loop
122+
123+
align 16
124+
_fold_and_prefetch_256_B_loop:
125+
add arg2, 256
126+
PREFETCH [arg2+fetch_dist+0]
127+
vmovdqu8 zmm3, [arg2+16*0]
128+
vpshufb zmm3, zmm3, zmm18
129+
vpclmulqdq zmm1, zmm0, zmm16, 0x00
130+
vpclmulqdq zmm0, zmm0, zmm16, 0x11
131+
vpternlogq zmm0, zmm1, zmm3, 0x96
132+
133+
PREFETCH [arg2+fetch_dist+64]
134+
vmovdqu8 zmm9, [arg2+16*4]
135+
vpshufb zmm9, zmm9, zmm18
136+
vpclmulqdq zmm5, zmm4, zmm16, 0x00
137+
vpclmulqdq zmm4, zmm4, zmm16, 0x11
138+
vpternlogq zmm4, zmm5, zmm9, 0x96
139+
140+
PREFETCH [arg2+fetch_dist+64*2]
141+
vmovdqu8 zmm11, [arg2+16*8]
142+
vpshufb zmm11, zmm11, zmm18
143+
vpclmulqdq zmm12, zmm7, zmm16, 0x00
144+
vpclmulqdq zmm7, zmm7, zmm16, 0x11
145+
vpternlogq zmm7, zmm12, zmm11, 0x96
146+
147+
PREFETCH [arg2+fetch_dist+64*3]
148+
vmovdqu8 zmm17, [arg2+16*12]
149+
vpshufb zmm17, zmm17, zmm18
150+
vpclmulqdq zmm14, zmm8, zmm16, 0x00
151+
vpclmulqdq zmm8, zmm8, zmm16, 0x11
152+
vpternlogq zmm8, zmm14, zmm17, 0x96
153+
154+
sub arg3, 256
155+
156+
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
157+
cmp arg3, (fetch_dist + 256)
158+
jge _fold_and_prefetch_256_B_loop
159+
%endif
160+
161+
align 16
118162
_fold_256_B_loop:
119163
add arg2, 256
120-
PREFETCH [arg2+fetch_dist+0]
121164
vmovdqu8 zmm3, [arg2+16*0]
122165
vpshufb zmm3, zmm3, zmm18
123166
vpclmulqdq zmm1, zmm0, zmm16, 0x00
124167
vpclmulqdq zmm0, zmm0, zmm16, 0x11
125168
vpternlogq zmm0, zmm1, zmm3, 0x96
126169

127-
PREFETCH [arg2+fetch_dist+64]
128170
vmovdqu8 zmm9, [arg2+16*4]
129171
vpshufb zmm9, zmm9, zmm18
130172
vpclmulqdq zmm5, zmm4, zmm16, 0x00
131173
vpclmulqdq zmm4, zmm4, zmm16, 0x11
132174
vpternlogq zmm4, zmm5, zmm9, 0x96
133175

134-
PREFETCH [arg2+fetch_dist+64*2]
135176
vmovdqu8 zmm11, [arg2+16*8]
136177
vpshufb zmm11, zmm11, zmm18
137178
vpclmulqdq zmm12, zmm7, zmm16, 0x00
138179
vpclmulqdq zmm7, zmm7, zmm16, 0x11
139180
vpternlogq zmm7, zmm12, zmm11, 0x96
140181

141-
PREFETCH [arg2+fetch_dist+64*3]
142182
vmovdqu8 zmm17, [arg2+16*12]
143183
vpshufb zmm17, zmm17, zmm18
144184
vpclmulqdq zmm14, zmm8, zmm16, 0x00

crc/crc64_iso_refl_by16_10.asm

Lines changed: 36 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -116,24 +116,56 @@ FUNCTION_NAME:
116116
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
117117
sub arg3, 256
118118

119+
%if fetch_dist != 0
120+
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
121+
cmp arg3, (fetch_dist + 256)
122+
jb _fold_256_B_loop
123+
124+
align 16
125+
_fold_and_prefetch_256_B_loop:
126+
add arg2, 256
127+
PREFETCH [arg2+fetch_dist+0]
128+
vpclmulqdq zmm1, zmm0, zmm16, 0x10
129+
vpclmulqdq zmm0, zmm0, zmm16, 0x01
130+
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
131+
132+
PREFETCH [arg2+fetch_dist+64]
133+
vpclmulqdq zmm2, zmm4, zmm16, 0x10
134+
vpclmulqdq zmm4, zmm4, zmm16, 0x01
135+
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
136+
137+
PREFETCH [arg2+fetch_dist+64*2]
138+
vpclmulqdq zmm3, zmm7, zmm16, 0x10
139+
vpclmulqdq zmm7, zmm7, zmm16, 0x01
140+
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
141+
142+
PREFETCH [arg2+fetch_dist+64*3]
143+
vpclmulqdq zmm5, zmm8, zmm16, 0x10
144+
vpclmulqdq zmm8, zmm8, zmm16, 0x01
145+
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
146+
147+
sub arg3, 256
148+
149+
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
150+
cmp arg3, (fetch_dist + 256)
151+
jge _fold_and_prefetch_256_B_loop
152+
%endif
153+
154+
align 16
119155
_fold_256_B_loop:
120156
add arg2, 256
121-
PREFETCH [arg2+fetch_dist+0]
122157
vpclmulqdq zmm1, zmm0, zmm16, 0x10
123158
vpclmulqdq zmm0, zmm0, zmm16, 0x01
124159
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
125160

126-
PREFETCH [arg2+fetch_dist+64]
127161
vpclmulqdq zmm2, zmm4, zmm16, 0x10
128162
vpclmulqdq zmm4, zmm4, zmm16, 0x01
129163
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
130164

131-
PREFETCH [arg2+fetch_dist+64*2]
132165
vpclmulqdq zmm3, zmm7, zmm16, 0x10
133166
vpclmulqdq zmm7, zmm7, zmm16, 0x01
134167
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
135168

136-
PREFETCH [arg2+fetch_dist+64*3]
137169
vpclmulqdq zmm5, zmm8, zmm16, 0x10
138170
vpclmulqdq zmm8, zmm8, zmm16, 0x01
139171
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96

0 commit comments

Comments
 (0)