@@ -178,21 +178,78 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
178
178
let mut n: usize ;
179
179
180
180
asm ! (
181
- // search for a zero byte
181
+ // For small sizes, we avoid invoking SSE instructions.
182
+ // We make manual comparisons instead.
182
183
"xor %eax, %eax" ,
184
+ "cmpb $0, (%rdi)" ,
185
+ "je 3f" ,
186
+ "mov $1, %eax" ,
187
+ "cmpb $0, 1(%rdi)" ,
188
+ "je 3f" ,
189
+ "mov $2, %eax" ,
190
+ "cmpb $0, 2(%rdi)" ,
191
+ "je 3f" ,
192
+ "mov $3, %eax" ,
193
+ "cmpb $0, 3(%rdi)" ,
194
+ "je 3f" ,
183
195
184
- // unbounded memory region
185
- "xor %ecx, %ecx" ,
186
- "not %rcx" ,
196
+ // Adjust address
197
+ "add $4, %rdi" ,
187
198
188
- // perform search
189
- "repne scasb (%rdi), %al" ,
199
+ // Align the address to 16 bytes (xmm register size).
200
+ // This is important, since an n byte read
201
+ // with n byte alignment is guaranteed to never cross
202
+ // a page boundary and thus will never try to access
203
+ // memory which may not be accessible.
204
+ "mov %edi, %ecx" ,
205
+ "and $15, %ecx" ,
206
+ "and $-16, %rdi" ,
190
207
191
- // extract length
192
- "not %rcx" ,
193
- "dec %rcx" ,
208
+ // zero out an xmm register for comparisons with zero.
209
+ "pxor %xmm0, %xmm0" ,
210
+
211
+ // One manual iteration of a zero byte search.
212
+ // Ensuring proper alignment may cause us to read
213
+ // memory _before_ the actual string start.
214
+ // Thus, one separate iteration is needed to handle this special case.
215
+ "movdqa (%rdi), %xmm1" ,
216
+ "pcmpeqb %xmm0, %xmm1" ,
217
+ "pmovmskb %xmm1, %eax" ,
218
+ // Shift out comparisons that don't belong to the actual string.
219
+ "shr %cl, %eax" ,
220
+ // Check if there was a zero
221
+ "test %eax, %eax" ,
222
+ "jz 1f" ,
223
+
224
+ // A zero was found: calculate result and exit.
225
+ "bsf %eax, %eax" ,
226
+ "add $4, %eax" ,
227
+ "jmp 3f" ,
228
+
229
+ // No zero was found: prepare main loop.
230
+ "1:" ,
231
+ "add $16, %rdi" ,
232
+ "neg %rcx" ,
233
+ "add $4, %rcx" ,
234
+
235
+ // main loop
236
+ "2:" ,
237
+ "movdqa (%rdi), %xmm1" ,
238
+ "add $16, %rdi" ,
239
+ "add $16, %rcx" ,
240
+ "pcmpeqb %xmm0, %xmm1" ,
241
+ "pmovmskb %xmm1, %eax" ,
242
+ // Check if there was a zero
243
+ "test %eax, %eax" ,
244
+ "jz 2b" ,
245
+
246
+ // A zero was found: calculate result and exit.
247
+ "bsf %eax, %eax" ,
248
+ "add %rcx, %rax" ,
249
+ "3:" ,
194
250
inout( "rdi" ) s => _,
195
- out( "rcx" ) n,
251
+ out( "rax" ) n,
252
+ out( "rcx" ) _,
196
253
options( att_syntax, nostack) ,
197
254
) ;
198
255
0 commit comments