Skip to content

Commit 7e4742d

Browse files
authored
Change implementation to SSE
1 parent 1a2f3b2 commit 7e4742d

File tree

1 file changed

+67
-10
lines changed

1 file changed

+67
-10
lines changed

src/mem/x86_64.rs

Lines changed: 67 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -178,21 +178,78 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
178178
let mut n: usize;
179179

180180
asm!(
181-
// search for a zero byte
181+
// For small sizes, we avoid invoking SSE instructions.
182+
// Instead, we compare the first four bytes manually.
182183
"xor %eax, %eax",
184+
"cmpb $0, (%rdi)",
185+
"je 3f",
186+
"mov $1, %eax",
187+
"cmpb $0, 1(%rdi)",
188+
"je 3f",
189+
"mov $2, %eax",
190+
"cmpb $0, 2(%rdi)",
191+
"je 3f",
192+
"mov $3, %eax",
193+
"cmpb $0, 3(%rdi)",
194+
"je 3f",
183195

184-
// unbounded memory region
185-
"xor %ecx, %ecx",
186-
"not %rcx",
196+
// Adjust address
197+
"add $4, %rdi",
187198

188-
// perform search
189-
"repne scasb (%rdi), %al",
199+
// Align the address to 16 bytes (xmm register size).
200+
// This is important, since an n byte read
201+
// with n byte alignment is guaranteed to never cross
202+
// a page boundary and thus will never try to access
203+
// memory which may not be accessible.
204+
"mov %edi, %ecx",
205+
"and $15, %ecx",
206+
"and $-16, %rdi",
190207

191-
// extract length
192-
"not %rcx",
193-
"dec %rcx",
208+
// zero out an xmm register for comparisons with zero.
209+
"pxor %xmm0, %xmm0",
210+
211+
// One manual iteration of a zero byte search.
212+
// Ensuring proper alignment may cause us to read
213+
// memory _before_ the actual string start.
214+
// Thus, one separate iteration is needed to handle this special case.
215+
"movdqa (%rdi), %xmm1",
216+
"pcmpeqb %xmm0, %xmm1",
217+
"pmovmskb %xmm1, %eax",
218+
// Shift out comparisons that don't belong to the actual string.
219+
"shr %cl, %eax",
220+
// Check if there was a zero
221+
"test %eax, %eax",
222+
"jz 1f",
223+
224+
// A zero was found: calculate result and exit.
225+
"bsf %eax, %eax",
226+
"add $4, %eax",
227+
"jmp 3f",
228+
229+
// No zero was found: prepare main loop.
230+
"1:",
231+
"add $16, %rdi",
232+
"neg %rcx",
233+
"add $4, %rcx",
234+
235+
// main loop
236+
"2:",
237+
"movdqa (%rdi), %xmm1",
238+
"add $16, %rdi",
239+
"add $16, %rcx",
240+
"pcmpeqb %xmm0, %xmm1",
241+
"pmovmskb %xmm1, %eax",
242+
// Check if there was a zero
243+
"test %eax, %eax",
244+
"jz 2b",
245+
246+
// A zero was found: calculate result and exit.
247+
"bsf %eax, %eax",
248+
"add %rcx, %rax",
249+
"3:",
194250
inout("rdi") s => _,
195-
out("rcx") n,
251+
out("rax") n,
252+
out("rcx") _,
196253
options(att_syntax, nostack),
197254
);
198255

0 commit comments

Comments
 (0)