52
52
# regular CRC code that does not interleave the CRC instructions.
53
53
#define SMALL_SIZE 200
54
54
55
- # unsigned int crc_pcl( const u8 *buffer, unsigned int len, unsigned int crc_init );
55
+ # u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
56
56
57
57
.text
58
- SYM_FUNC_START(crc_pcl)
59
- #define bufp %rdi
60
- #define bufp_d %edi
61
- #define len %esi
62
- #define crc_init %edx
63
- #define crc_init_q %rdx
58
+ SYM_FUNC_START(crc32c_x86_3way)
59
+ #define crc0 %edi
60
+ #define crc0_q %rdi
61
+ #define bufp %rsi
62
+ #define bufp_d %esi
63
+ #define len %rdx
64
+ #define len_dw %edx
64
65
#define n_misaligned %ecx /* overlaps chunk_bytes! */
65
66
#define n_misaligned_q %rcx
66
67
#define chunk_bytes %ecx /* overlaps n_misaligned! */
@@ -85,9 +86,9 @@ SYM_FUNC_START(crc_pcl)
85
86
.Ldo_align:
86
87
movq (bufp), %rax
87
88
add n_misaligned_q, bufp
88
- sub n_misaligned , len
89
+ sub n_misaligned_q , len
89
90
.Lalign_loop:
90
- crc32b %al , crc_init # compute crc32 of 1-byte
91
+ crc32b %al , crc0 # compute crc32 of 1-byte
91
92
shr $8 , %rax # get next byte
92
93
dec n_misaligned
93
94
jne .Lalign_loop
@@ -102,7 +103,7 @@ SYM_FUNC_START(crc_pcl)
102
103
103
104
.Lpartial_block:
104
105
# Compute floor(len / 24) to get num qwords to process from each lane.
105
- imul $2731 , len , %eax # 2731 = ceil(2^16 / 24)
106
+ imul $2731 , len_dw , %eax # 2731 = ceil(2^16 / 24)
106
107
shr $16 , %eax
107
108
jmp .Lcrc_3lanes
108
109
@@ -125,16 +126,16 @@ SYM_FUNC_START(crc_pcl)
125
126
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
126
127
# bookkeeping instructions, which can compete with crc32q for the ALUs.
127
128
.Lcrc_3lanes_4x_loop:
128
- crc32q (bufp), crc_init_q
129
+ crc32q (bufp), crc0_q
129
130
crc32q (bufp,chunk_bytes_q), crc1
130
131
crc32q (bufp,chunk_bytes_q,2 ), crc2
131
- crc32q 8 (bufp), crc_init_q
132
+ crc32q 8 (bufp), crc0_q
132
133
crc32q 8 (bufp,chunk_bytes_q), crc1
133
134
crc32q 8 (bufp,chunk_bytes_q,2 ), crc2
134
- crc32q 16 (bufp), crc_init_q
135
+ crc32q 16 (bufp), crc0_q
135
136
crc32q 16 (bufp,chunk_bytes_q), crc1
136
137
crc32q 16 (bufp,chunk_bytes_q,2 ), crc2
137
- crc32q 24 (bufp), crc_init_q
138
+ crc32q 24 (bufp), crc0_q
138
139
crc32q 24 (bufp,chunk_bytes_q), crc1
139
140
crc32q 24 (bufp,chunk_bytes_q,2 ), crc2
140
141
add $32 , bufp
@@ -146,15 +147,15 @@ SYM_FUNC_START(crc_pcl)
146
147
jz .Lcrc_3lanes_last_qword
147
148
148
149
.Lcrc_3lanes_1x_loop:
149
- crc32q (bufp), crc_init_q
150
+ crc32q (bufp), crc0_q
150
151
crc32q (bufp,chunk_bytes_q), crc1
151
152
crc32q (bufp,chunk_bytes_q,2 ), crc2
152
153
add $8 , bufp
153
154
dec %eax
154
155
jnz .Lcrc_3lanes_1x_loop
155
156
156
157
.Lcrc_3lanes_last_qword:
157
- crc32q (bufp), crc_init_q
158
+ crc32q (bufp), crc0_q
158
159
crc32q (bufp,chunk_bytes_q), crc1
159
160
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
160
161
@@ -165,9 +166,9 @@ SYM_FUNC_START(crc_pcl)
165
166
lea (K_table-8 )(%rip ), %rax # first entry is for idx 1
166
167
pmovzxdq (%rax ,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
167
168
lea (chunk_bytes,chunk_bytes,2 ), %eax # chunk_bytes * 3
168
- sub %eax , len # len -= chunk_bytes * 3
169
+ sub %rax , len # len -= chunk_bytes * 3
169
170
170
- movq crc_init_q , %xmm1 # CRC for block 1
171
+ movq crc0_q , %xmm1 # CRC for block 1
171
172
pclmulqdq $0x00 , %xmm0 , %xmm1 # Multiply by K2
172
173
173
174
movq crc1, %xmm2 # CRC for block 2
@@ -176,8 +177,8 @@ SYM_FUNC_START(crc_pcl)
176
177
pxor %xmm2 ,%xmm1
177
178
movq %xmm1 , %rax
178
179
xor (bufp,chunk_bytes_q,2 ), %rax
179
- mov crc2, crc_init_q
180
- crc32 %rax , crc_init_q
180
+ mov crc2, crc0_q
181
+ crc32 %rax , crc0_q
181
182
lea 8 (bufp,chunk_bytes_q,2 ), bufp
182
183
183
184
################################################################
@@ -193,34 +194,34 @@ SYM_FUNC_START(crc_pcl)
193
194
## 6) Process any remainder without interleaving:
194
195
#######################################################################
195
196
.Lsmall:
196
- test len, len
197
+ test len_dw, len_dw
197
198
jz .Ldone
198
- mov len , %eax
199
+ mov len_dw , %eax
199
200
shr $3 , %eax
200
201
jz .Ldo_dword
201
202
.Ldo_qwords:
202
- crc32q (bufp), crc_init_q
203
+ crc32q (bufp), crc0_q
203
204
add $8 , bufp
204
205
dec %eax
205
206
jnz .Ldo_qwords
206
207
.Ldo_dword:
207
- test $4 , len
208
+ test $4 , len_dw
208
209
jz .Ldo_word
209
- crc32l (bufp), crc_init
210
+ crc32l (bufp), crc0
210
211
add $4 , bufp
211
212
.Ldo_word:
212
- test $2 , len
213
+ test $2 , len_dw
213
214
jz .Ldo_byte
214
- crc32w (bufp), crc_init
215
+ crc32w (bufp), crc0
215
216
add $2 , bufp
216
217
.Ldo_byte:
217
- test $1 , len
218
+ test $1 , len_dw
218
219
jz .Ldone
219
- crc32b (bufp), crc_init
220
+ crc32b (bufp), crc0
220
221
.Ldone:
221
- mov crc_init , %eax
222
+ mov crc0 , %eax
222
223
RET
223
- SYM_FUNC_END(crc_pcl )
224
+ SYM_FUNC_END(crc32c_x86_3way )
224
225
225
226
.section .rodata, "a" , @progbits
226
227
################################################################
0 commit comments