Skip to content

Commit bfa1e1b

Browse files
authored
Merge pull request #944 from novak-as/perf-adler32
Use hardware-accelerated Adler32 computation.
2 parents b221a8c + bc5bede commit bfa1e1b

File tree

2 files changed

+263
-14
lines changed

2 files changed

+263
-14
lines changed

src/MySqlConnector/Protocol/Serialization/CompressedPayloadHandler.cs

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,8 @@ private ValueTask<int> ReadBytesAsync(Memory<byte> buffer, ProtocolErrorBehavior
150150
var bytesRead = decompressingStream.Read(uncompressedData, 0, uncompressedLength);
151151
m_remainingData = new(uncompressedData, 0, bytesRead);
152152

153-
var checksum = ComputeAdler32Checksum(uncompressedData, 0, bytesRead);
153+
var checksum = Adler32.Calculate(uncompressedData, 0, (uint)bytesRead);
154+
154155
var adlerStartOffset = payloadReadBytes.Offset + payloadReadBytes.Count - 4;
155156
if (payloadReadBytes.Array[adlerStartOffset + 0] != ((checksum >> 24) & 0xFF) ||
156157
payloadReadBytes.Array[adlerStartOffset + 1] != ((checksum >> 16) & 0xFF) ||
@@ -194,8 +195,7 @@ private ValueTask<int> CompressAndWrite(ArraySegment<byte> remainingUncompressed
194195
using (var deflateStream = new DeflateStream(compressedStream, CompressionLevel.Optimal, leaveOpen: true))
195196
deflateStream.Write(remainingUncompressedData.Array!, remainingUncompressedData.Offset, remainingUncompressedBytes);
196197

197-
// write Adler-32 checksum to stream
198-
var checksum = ComputeAdler32Checksum(remainingUncompressedData.Array!, remainingUncompressedData.Offset, remainingUncompressedBytes);
198+
var checksum = Adler32.Calculate(remainingUncompressedData.Array!, (uint)remainingUncompressedData.Offset, (uint)remainingUncompressedBytes);
199199
compressedStream.WriteByte((byte) ((checksum >> 24) & 0xFF));
200200
compressedStream.WriteByte((byte) ((checksum >> 16) & 0xFF));
201201
compressedStream.WriteByte((byte) ((checksum >> 8) & 0xFF));
@@ -225,17 +225,6 @@ private ValueTask<int> CompressAndWrite(ArraySegment<byte> remainingUncompressed
225225
CompressAndWrite(remainingUncompressedData, ioBehavior));
226226
}
227227

228-
/// <summary>
/// Computes the Adler-32 checksum of <paramref name="length"/> bytes of
/// <paramref name="data"/>, starting at index <paramref name="offset"/>.
/// </summary>
/// <param name="data">The array holding the bytes to checksum.</param>
/// <param name="offset">Index of the first byte to include.</param>
/// <param name="length">Number of bytes to include.</param>
/// <returns>The checksum, with the running sum-of-sums in the upper 16 bits and the running byte sum in the lower 16 bits.</returns>
private static uint ComputeAdler32Checksum(byte[] data, int offset, int length)
{
	const int modulus = 65521;

	var low = 1;
	var high = 0;
	int position = offset, remaining = length;

	// Both sums are reduced after every byte, so they always stay below the modulus.
	while (remaining > 0)
	{
		low = (low + data[position]) % modulus;
		high = (high + low) % modulus;
		position++;
		remaining--;
	}

	var upperHalf = ((uint) high) << 16;
	return upperHalf | (uint) low;
}
238-
239228
// CompressedByteHandler implements IByteHandler and delegates reading bytes back to the CompressedPayloadHandler class.
240229
private sealed class CompressedByteHandler : IByteHandler
241230
{
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
// https://github.com/SixLabors/ImageSharp/blob/master/src/ImageSharp/Formats/Png/Zlib/Adler32.cs
4+
5+
6+
#if NETCOREAPP
7+
using System;
8+
9+
#if !NETCOREAPP2_1
10+
using System.Runtime.Intrinsics;
11+
using System.Runtime.Intrinsics.X86;
12+
#endif
13+
14+
#endif
15+
16+
#pragma warning disable IDE0007 // Use implicit type
17+
18+
namespace MySqlConnector.Utilities
19+
{
20+
/// <summary>
/// Calculates the 32-bit Adler checksum of a given buffer according to
/// RFC 1950 (ZLIB Compressed Data Format Specification version 3.3).
/// </summary>
public static class Adler32
{
    /// <summary>
    /// The default initial seed value of an Adler32 checksum calculation.
    /// </summary>
    public const uint SeedValue = 1U;

    // Largest prime smaller than 65536
    private const uint BASE = 65521;

    // NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    private const uint NMAX = 5552;

#if NETCOREAPP && !NETCOREAPP2_1
    // Inputs shorter than this are checksummed with the scalar loop.
    private const int MinBufferSize = 64;

    // The C# compiler emits this as a compile-time constant embedded in the PE file.
    private static ReadOnlySpan<byte> Tap1Tap2 => new byte[]
    {
        32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // tap1
        16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 // tap2
    };
#endif

    /// <summary>
    /// Calculates the Adler32 checksum of <paramref name="length"/> bytes of
    /// <paramref name="buffer"/>, starting at <paramref name="offset"/>.
    /// </summary>
    /// <param name="buffer">The bytes to read from.</param>
    /// <param name="offset">The index in <paramref name="buffer"/> of the first byte to include.</param>
    /// <param name="length">The number of bytes to include.</param>
    /// <returns>The checksum; <see cref="SeedValue"/> when <paramref name="length"/> is zero.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// <paramref name="offset"/> and <paramref name="length"/> do not describe a range inside <paramref name="buffer"/>.
    /// </exception>
#if NETCOREAPP
    public static uint Calculate(ReadOnlySpan<byte> buffer, uint offset, uint length)
#else
    public static uint Calculate(byte[] buffer, uint offset, uint length)
#endif
    {
        // Exception types are fully qualified because 'using System;' is only
        // imported for NETCOREAPP builds of this file.
#if !NETCOREAPP
        if (buffer is null)
        {
            throw new System.ArgumentNullException(nameof(buffer));
        }
#endif

        // The implementations below read through raw pointers, so an invalid
        // range must be rejected here; nothing further down is bounds-checked.
        uint available = (uint)buffer.Length;
        if (offset > available)
        {
            throw new System.ArgumentOutOfRangeException(nameof(offset));
        }

        if (length > available - offset)
        {
            throw new System.ArgumentOutOfRangeException(nameof(length));
        }

        if (length == 0)
        {
            return SeedValue;
        }

#if NETCOREAPP && !NETCOREAPP2_1
        // Choose the implementation from the number of bytes actually being
        // checksummed, not from the size of the containing buffer.
        if (Ssse3.IsSupported && length >= MinBufferSize)
        {
            return CalculateSse(buffer, offset, length);
        }
#endif

        return CalculateScalar(buffer, offset, length);
    }

#if NETCOREAPP && !NETCOREAPP2_1
    // Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
    // The caller must have verified that [offset, offset + length) lies inside buffer.
    private static unsafe uint CalculateSse(ReadOnlySpan<byte> buffer, uint offset, uint length)
    {
        uint s1 = SeedValue & 0xFFFF;
        uint s2 = (SeedValue >> 16) & 0xFFFF;

        // Process the data in blocks.
        const int BLOCK_SIZE = 1 << 5;

        uint blocks = length / BLOCK_SIZE;

        // From here on, length is the number of tail bytes left after the whole blocks.
        length -= blocks * BLOCK_SIZE;

        fixed (byte* bufferPtr = buffer)
        fixed (byte* tapPtr = Tap1Tap2)
        {
            var localBufferPtr = bufferPtr + offset;

            // _mm_setr_epi8 on x86
            Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
            Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
            Vector128<byte> zero = Vector128<byte>.Zero;
            var ones = Vector128.Create((short)1);

            while (blocks > 0)
            {
                uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
                if (n > blocks)
                {
                    n = blocks;
                }

                blocks -= n;

                // Process n blocks of data. At most NMAX data bytes can be
                // processed before s2 must be reduced modulo BASE.
                Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
                Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
                Vector128<uint> v_s1 = Vector128<uint>.Zero;

                do
                {
                    // Load 32 input bytes.
                    Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                    Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                    // Add previous block byte sum to v_ps.
                    v_ps = Sse2.Add(v_ps, v_s1);

                    // Horizontally add the bytes for s1, multiply-adds the
                    // bytes by [ 32, 31, 30, ... ] for s2.
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                    Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());

                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                    Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                    localBufferPtr += BLOCK_SIZE;
                }
                while (--n > 0);

                // Shifting left by 5 multiplies the accumulated prefix sums by BLOCK_SIZE (32).
                v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                const byte S2301 = 0b1011_0001; // A B C D -> B A D C
                const byte S1032 = 0b0100_1110; // A B C D -> C D A B

                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                s1 += v_s1.ToScalar();

                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                s2 = v_s2.ToScalar();

                // Reduce.
                s1 %= BASE;
                s2 %= BASE;
            }

            // Fewer than BLOCK_SIZE bytes remain; fold them in with scalar arithmetic.
            if (length > 0)
            {
                if (length >= 16)
                {
                    s2 += s1 += localBufferPtr[0];
                    s2 += s1 += localBufferPtr[1];
                    s2 += s1 += localBufferPtr[2];
                    s2 += s1 += localBufferPtr[3];
                    s2 += s1 += localBufferPtr[4];
                    s2 += s1 += localBufferPtr[5];
                    s2 += s1 += localBufferPtr[6];
                    s2 += s1 += localBufferPtr[7];
                    s2 += s1 += localBufferPtr[8];
                    s2 += s1 += localBufferPtr[9];
                    s2 += s1 += localBufferPtr[10];
                    s2 += s1 += localBufferPtr[11];
                    s2 += s1 += localBufferPtr[12];
                    s2 += s1 += localBufferPtr[13];
                    s2 += s1 += localBufferPtr[14];
                    s2 += s1 += localBufferPtr[15];

                    localBufferPtr += 16;
                    length -= 16;
                }

                while (length-- > 0)
                {
                    s2 += s1 += *localBufferPtr++;
                }

                // s1 was below BASE before the tail and grew by at most 31 * 255,
                // so a single subtraction is enough to reduce it.
                if (s1 >= BASE)
                {
                    s1 -= BASE;
                }

                s2 %= BASE;
            }

            return s1 | (s2 << 16);
        }
    }
#endif

    // Portable implementation. The caller must have verified that
    // [offset, offset + length) lies inside buffer.
#if NETCOREAPP
    private static unsafe uint CalculateScalar(ReadOnlySpan<byte> buffer, uint offset, uint length)
#else
    private static unsafe uint CalculateScalar(byte[] buffer, uint offset, uint length)
#endif
    {
        uint s1 = SeedValue & 0xFFFF;
        uint s2 = (SeedValue >> 16) & 0xFFFF;
        uint k;

        fixed (byte* bufferPtr = buffer)
        {
            var localBufferPtr = bufferPtr + offset;

            while (length > 0)
            {
                // Take at most NMAX bytes between reductions so the 32-bit sums cannot overflow.
                k = length < NMAX ? length : NMAX;
                length -= k;

                while (k >= 16)
                {
                    s2 += s1 += localBufferPtr[0];
                    s2 += s1 += localBufferPtr[1];
                    s2 += s1 += localBufferPtr[2];
                    s2 += s1 += localBufferPtr[3];
                    s2 += s1 += localBufferPtr[4];
                    s2 += s1 += localBufferPtr[5];
                    s2 += s1 += localBufferPtr[6];
                    s2 += s1 += localBufferPtr[7];
                    s2 += s1 += localBufferPtr[8];
                    s2 += s1 += localBufferPtr[9];
                    s2 += s1 += localBufferPtr[10];
                    s2 += s1 += localBufferPtr[11];
                    s2 += s1 += localBufferPtr[12];
                    s2 += s1 += localBufferPtr[13];
                    s2 += s1 += localBufferPtr[14];
                    s2 += s1 += localBufferPtr[15];

                    localBufferPtr += 16;
                    k -= 16;
                }

                while (k-- > 0)
                {
                    s2 += s1 += *localBufferPtr++;
                }

                s1 %= BASE;
                s2 %= BASE;
            }

            return (s2 << 16) | s1;
        }
    }
}
260+
}

0 commit comments

Comments
 (0)