Why does the JIT save and restore xmm vector registers if there are more than 6 concurrently used ymm registers? #86944
-
The main question is: why does the JIT emit these stack <=> xmm6-9 register moves, and why only for those registers and not for xmm0-5? Also, why does it bother to do this only with the lower (xmm) half of the ymm registers and simply discard the upper half?
; Benchmark.BenchmarkAdler32.VectorSum(Byte[])
; Prologue: reserve stack space and store xmm6-xmm9 into the slots at [rsp]..[rsp+30].
; The same four registers are reloaded from these slots right before ret; their ymm
; counterparts (ymm6-ymm9) are used as load targets in the unrolled loop at M00_L00.
sub rsp,48
vzeroupper
vmovaps [rsp+30],xmm6
vmovaps [rsp+20],xmm7
vmovaps [rsp+10],xmm8
vmovaps [rsp],xmm9
; NOTE(review): presumably the null check on `data` (rdx), followed by
; rax = address of the first array element (dataRef) - confirm.
cmp [rdx],dl
lea rax,[rdx+10]
; ymm0 = 0; this is the register that every vpaddq below accumulates into (vectorSum).
vxorps ymm0,ymm0,ymm0
; ymm1 = 32 bytes loaded through a fixed address - presumably the `zeroes` data behind `zero`.
mov ecx,80004038
mov rcx,[rcx]
vmovups ymm1,[rcx+10]
; ecx = 1 (initial value of `sum`).
mov ecx,1
; rdx = value loaded relative to the array object - presumably native_length; confirm.
mov r8,[rdx]
mov r8d,[r8+4]
mov rdx,[rdx+r8-10]
; Fewer than 0x20 (32) bytes: skip all vector code and go straight to the scalar tail.
cmp rdx,20
jl M00_L04
; rcx = rax + (rdx & ~0x1F): end of the whole-vector region.
mov rcx,rdx
and rcx,0FFFFFFFFFFFFFFE0
add rcx,rax
; r8 = rax + (rdx & ~0xFF): end of the region processed 8 vectors at a time.
mov r8,rdx
and r8,0FFFFFFFFFFFFFF00
add r8,rax
cmp rax,r8
jae M00_L01
; Unrolled loop: 8 loads into ymm2-ymm9, then 8 vpsadbw/vpaddq pairs accumulating into ymm0.
; ymm2 is reused as the temporary for every vpsadbw result; rax advances by 0x100 per iteration.
M00_L00:
vmovups ymm2,[rax]
vmovups ymm3,[rax+20]
vmovups ymm4,[rax+40]
vmovups ymm5,[rax+60]
vmovups ymm6,[rax+80]
vmovups ymm7,[rax+0A0]
vmovups ymm8,[rax+0C0]
vmovups ymm9,[rax+0E0]
vpsadbw ymm2,ymm2,ymm1
vpaddq ymm0,ymm0,ymm2
vpsadbw ymm2,ymm3,ymm1
vpaddq ymm0,ymm0,ymm2
vpsadbw ymm2,ymm4,ymm1
vpaddq ymm0,ymm0,ymm2
vpsadbw ymm2,ymm5,ymm1
vpaddq ymm0,ymm0,ymm2
vpsadbw ymm2,ymm6,ymm1
vpaddq ymm0,ymm0,ymm2
vpsadbw ymm2,ymm7,ymm1
vpaddq ymm0,ymm0,ymm2
vpsadbw ymm2,ymm8,ymm1
vpaddq ymm0,ymm0,ymm2
vpsadbw ymm2,ymm9,ymm1
vpaddq ymm0,ymm0,ymm2
add rax,100
cmp rax,r8
jb short M00_L00
; Remaining whole vectors, one 0x20-byte vector per iteration.
; The two nops before M00_L02 are presumably loop-alignment padding.
M00_L01:
cmp rax,rcx
jae short M00_L03
nop [rax+rax]
nop [rax+rax]
M00_L02:
vmovups ymm2,[rax]
vpsadbw ymm2,ymm2,ymm1
vpaddq ymm0,ymm0,ymm2
add rax,20
cmp rax,rcx
jb short M00_L02
; Horizontal reduction (inlined CustomSum256): add the upper and lower 128-bit halves of ymm0,
; then add the two 64-bit lanes into rcx. This replaces the 1 that was loaded into ecx earlier.
M00_L03:
vmovaps ymm1,ymm0
vextracti128 xmm0,ymm0,1
vpaddq xmm0,xmm1,xmm0
vpextrq rcx,xmm0,0
vpextrq r8,xmm0,1
add rcx,r8
; Scalar tail: rdx = rax + (rdx & 0x1F); the remaining bytes are added to rcx one at a time.
M00_L04:
and rdx,1F
add rdx,rax
cmp rax,rdx
jae short M00_L06
xchg ax,ax
M00_L05:
movzx r8d,byte ptr [rax]
add rcx,r8
inc rax
cmp rax,rdx
jb short M00_L05
; Epilogue: move the result to rax, reload xmm6-xmm9 from their stack slots, release the stack.
M00_L06:
mov rax,rcx
vmovaps xmm6,[rsp+30]
vmovaps xmm7,[rsp+20]
vmovaps xmm8,[rsp+10]
vmovaps xmm9,[rsp]
vzeroupper
add rsp,48
ret
; Total bytes of code 369
POC:
[Benchmark]
// Benchmark proof-of-concept: sums the bytes of `data` with 256-bit vectors, followed by a
// scalar loop for the leftover bytes. Each loaded vector is reduced with
// Avx2.SumAbsoluteDifferences against `zero` and added to a Vector256<ulong> accumulator.
// The 8x-unrolled loop deliberately keeps eight loaded vectors live at the same time.
[ArgumentsSource(nameof(Data))]
[SkipLocalsInit]
public ulong VectorSum(byte[] data) {
ref byte dataRef = ref MemoryMarshal.GetArrayDataReference(data);
Vector256<ulong> vectorSum = Vector256<ulong>.Zero;
// `zeroes` is declared outside this snippet; presumably an all-zero byte array - confirm.
Vector256<byte> zero = Vector256.Create(zeroes);
ulong sum = 1;
// Reads the nint stored one nint-sized slot before the first element.
// NOTE(review): presumably the array length field; this relies on the runtime's array layout - confirm.
nint native_length = Unsafe.Add(ref Unsafe.As<byte, nint>(ref dataRef), -1);
if (native_length >= Vector256<byte>.Count) {
// `end` marks the last position reachable in whole-vector steps (length rounded down to a multiple of Count).
ref byte end = ref Unsafe.Add(ref dataRef, native_length & ~(Vector256<byte>.Count - 1));
// Main loop: `end_8_pairs` is the length rounded down to a multiple of 8 vectors; each iteration consumes 8 vectors.
for (ref byte end_8_pairs = ref Unsafe.Add(ref dataRef, native_length & ~(Vector256<byte>.Count * 8 - 1));
Unsafe.IsAddressLessThan(ref dataRef, ref end_8_pairs);
dataRef = ref Unsafe.Add(ref dataRef, Vector256<byte>.Count * 8)) {
// All eight loads are issued before any of the values is consumed.
Vector256<byte> bytes = Vector256.LoadUnsafe(ref dataRef);
Vector256<byte> bytes2 = Vector256.LoadUnsafe(ref Unsafe.Add(ref dataRef, Vector256<byte>.Count * 1));
Vector256<byte> bytes3 = Vector256.LoadUnsafe(ref Unsafe.Add(ref dataRef, Vector256<byte>.Count * 2));
Vector256<byte> bytes4 = Vector256.LoadUnsafe(ref Unsafe.Add(ref dataRef, Vector256<byte>.Count * 3));
Vector256<byte> bytes5 = Vector256.LoadUnsafe(ref Unsafe.Add(ref dataRef, Vector256<byte>.Count * 4));
Vector256<byte> bytes6 = Vector256.LoadUnsafe(ref Unsafe.Add(ref dataRef, Vector256<byte>.Count * 5));
Vector256<byte> bytes7 = Vector256.LoadUnsafe(ref Unsafe.Add(ref dataRef, Vector256<byte>.Count * 6));
Vector256<byte> bytes8 = Vector256.LoadUnsafe(ref Unsafe.Add(ref dataRef, Vector256<byte>.Count * 7));
// Reduce each vector against `zero` and accumulate the 64-bit partial sums.
vectorSum += Avx2.SumAbsoluteDifferences(bytes, zero).AsUInt64();
vectorSum += Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt64();
vectorSum += Avx2.SumAbsoluteDifferences(bytes3, zero).AsUInt64();
vectorSum += Avx2.SumAbsoluteDifferences(bytes4, zero).AsUInt64();
vectorSum += Avx2.SumAbsoluteDifferences(bytes5, zero).AsUInt64();
vectorSum += Avx2.SumAbsoluteDifferences(bytes6, zero).AsUInt64();
vectorSum += Avx2.SumAbsoluteDifferences(bytes7, zero).AsUInt64();
vectorSum += Avx2.SumAbsoluteDifferences(bytes8, zero).AsUInt64();
}
// Remaining whole vectors, one at a time.
while (Unsafe.IsAddressLessThan(ref dataRef, ref end)) {
var bytes = Vector256.LoadUnsafe(ref dataRef);
vectorSum += Avx2.SumAbsoluteDifferences(bytes, zero).AsUInt64();
dataRef = ref Unsafe.Add(ref dataRef, Vector256<byte>.Count);
}
// NOTE(review): this assignment replaces the initial value 1 set above, so inputs that take the
// vector path lose the +1 that shorter inputs keep - confirm whether `+=` was intended.
sum = CustomSum256(vectorSum);
}
// Scalar tail: the remaining (native_length mod Count) bytes are added one by one.
for (ref byte end = ref Unsafe.Add(ref dataRef, native_length & (Vector256<byte>.Count - 1));
Unsafe.IsAddressLessThan(ref dataRef, ref end);
dataRef = ref Unsafe.Add(ref dataRef, 1)) {
sum += dataRef;
}
return sum;
}
/// <summary>
/// Horizontally adds the four 64-bit lanes of <paramref name="vector"/>.
/// </summary>
/// <param name="vector">The vector whose lanes are summed.</param>
/// <returns>The sum of all four <see cref="ulong"/> lanes.</returns>
[SkipLocalsInit]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong CustomSum256(Vector256<ulong> vector) {
// Fold 256 -> 128 bits by adding the lower and upper halves lane-wise.
Vector128<ulong> temp = vector.GetLower() + vector.GetUpper();
// Fold 128 -> 64 bits by adding the two remaining lanes.
return Sse41.X64.Extract(temp, 0) + Sse41.X64.Extract(temp, 1);
} BTW maybe I could open an issue but first I want to be sure that's not intended behavior. |
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 1 reply
-
Every platform has an Application Binary Interface (ABI) that dictates which registers are callee vs. caller save. On Windows x64, XMM6-XMM15 are non-volatile (callee save). When you use more than the number of volatile (caller save) registers, then the method needs to spill the non-volatile (callee save) values. Inversely, if you have a non-inlined call you would also have to spill any values in volatile registers across the call-site. |
Beta Was this translation helpful? Give feedback.
Every platform has an Application Binary Interface (ABI), and that dictates which registers are callee vs. caller save.

On Windows x64, XMM0-XMM5 are considered volatile across a call boundary and so must be preserved by the caller if the value needs to be saved. XMM6-XMM15, however, are considered non-volatile and must be preserved by the callee if they are used. The upper bits of these registers are all volatile and must be preserved by the caller. On Unix x64, there are no non-volatile registers and all of XMM0-XMM15 are considered caller save.

When you use more than the number of volatile (caller save) registers, then the method needs to spill the non-volatile (callee save) values. Invers…