-
Notifications
You must be signed in to change notification settings - Fork 43
Description
I'm investigating memcpy speedups and plan to base my code on modifications to the compiler's output (as a baseline). I'm reading the output generated for this specific function:
void* memcpy_basic(void* s1, const void* s2, size_t sz)
{
for (size_t i = 0; i < sz; i++)
{
((char*)s1)[i] = ((const char*)s2)[i];
}
return s1;
}
The assembly produced is:
[global _memcpy_basic]
; memcpy_basic
_memcpy_basic:
push ebx
push esi
push edi
L_1:
mov esi,dword [esp+0ch+0ch]
mov edx,dword [esp+08h+0ch]
mov eax,dword [esp+04h+0ch]
; Line 14: for (size_t i = 0; i < sz; i++)
L_10:
xor ebx,ebx
cmp ebx,esi
jnc L_6
L_4:
; Line 15: {
; Line 16: ((char*)s1)[i] = ((const char*)s2)[i];
mov edi,ebx
mov cl,byte [edx+ebx]
mov byte [eax+ebx],cl
; Line 17: }
L_7:
mov ebx,edi
inc ebx
L_5:
cmp ebx,esi
jc L_4
L_6:
; Line 18: return s1;
; Line 19: }
L_2:
pop edi
pop esi
pop ebx
ret
For some reason, on every loop iteration ebx is saved to edi and then restored from it, but ebx is never used in a way that would warrant this.
Something in the loop optimizer isn't looking across the two blocks L_4 and L_7 and noticing that EBX is only incremented and not otherwise used, so the extra movs are wasted.
I don't know where that is coming from, but this is what I see when I dump the ICDs:
; Unoptimized
L_4:
DBG Block START
; Line 15: {
DBG Block START
; Line 16: ((char*)s1)[i] = ((const char*)s2)[i];
T3.A = _s2:LINK(8).A
T2.UI = _i:LINK(0).UI
T4.I = T2.UI
T5.A = T3.A + T4.I
T6.C = *(T5).A.C
T7.A = _s1:LINK(4).A
T8.A = T7.A + T4.I
*(T8).A.C = T6.C
; Line 17: }
DBG Block END
DBG Block END
BLOCK 5
L_7:
T2.UI = _i:LINK(0).UI
T9.UI = T2.UI + #1.UI
_i:LINK(0).UI = T9.UI
DBG Block END
BLOCK 6
L_5:
T1.UI = _sz:LINK(12).UI
T2.UI = _i:LINK(0).UI
CONDGO L_4:PC ; T2.UI U< T1.UI
BLOCK 7
; Optimized (after register allocation)
L_4:
DBG Block START
; Line 15: {
DBG Block START
; Line 16: ((char*)s1)[i] = ((const char*)s2)[i];
T10(EDI).UI = T12(EBX).UI
T9(CL).C = *(T2(EDX) + T12(EBX)).A.C
*(T7(EAX) + T12(EBX)).A.C = T9(CL).C
; Line 17: }
DBG Block END
DBG Block END
BLOCK END
BLOCK 5
L_7:
T12(EBX).UI = T10(EDI).UI + #1.UI
DBG Block END
BLOCK END
BLOCK 6
L_5:
T13(EDI).UI = T1(ESI).UI
T14(ECX).UI = T12(EBX).UI
CONDGO L_4:PC ; T14(ECX).UI U< T13(EDI).UI
BLOCK END
BLOCK 7
To me, this looks like a redundancy-elimination pass wasn't run on that assignment and reassignment.
I know very little about the optimizer internals here, so I don't think I can be of much help beyond identifying the issue and asking "What's going on over here?"