Skip to content

Commit 224aa40

Browse files
Merge branch 'main' into sn/nullable/format_pbm
2 parents f9ac662 + fff41c9 commit 224aa40

File tree

23 files changed

+735
-273
lines changed

23 files changed

+735
-273
lines changed

src/ImageSharp/Color/Color.cs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -286,13 +286,10 @@ public TPixel ToPixel<TPixel>()
286286
/// Bulk converts a span of <see cref="Color"/> to a span of a specified <typeparamref name="TPixel"/> type.
287287
/// </summary>
288288
/// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
289-
/// <param name="configuration">The configuration.</param>
290289
/// <param name="source">The source color span.</param>
291290
/// <param name="destination">The destination pixel span.</param>
292291
[MethodImpl(InliningOptions.ShortMethod)]
293-
#pragma warning disable RCS1163 // Unused parameter.
294-
public static void ToPixel<TPixel>(Configuration configuration, ReadOnlySpan<Color> source, Span<TPixel> destination)
295-
#pragma warning restore RCS1163 // Unused parameter.
292+
public static void ToPixel<TPixel>(ReadOnlySpan<Color> source, Span<TPixel> destination)
296293
where TPixel : unmanaged, IPixel<TPixel>
297294
{
298295
// TODO: Investigate bulk operations here. (This overload no longer takes a configuration parameter.)

src/ImageSharp/Compression/Zlib/Adler32.cs

Lines changed: 106 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System.Runtime.CompilerServices;
55
using System.Runtime.InteropServices;
66
using System.Runtime.Intrinsics;
7+
using System.Runtime.Intrinsics.Arm;
78
using System.Runtime.Intrinsics.X86;
89

910
#pragma warning disable IDE0007 // Use implicit type
@@ -70,6 +71,11 @@ public static uint Calculate(uint adler, ReadOnlySpan<byte> buffer)
7071
return CalculateSse(adler, buffer);
7172
}
7273

74+
if (AdvSimd.IsSupported)
75+
{
76+
return CalculateArm(adler, buffer);
77+
}
78+
7379
return CalculateScalar(adler, buffer);
7480
}
7581

@@ -95,7 +101,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
95101
Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
96102
Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
97103
Vector128<byte> zero = Vector128<byte>.Zero;
98-
var ones = Vector128.Create((short)1);
104+
Vector128<short> ones = Vector128.Create((short)1);
99105

100106
while (blocks > 0)
101107
{
@@ -179,13 +185,13 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
179185
byte* localBufferPtr = bufferPtr;
180186

181187
Vector256<byte> zero = Vector256<byte>.Zero;
182-
var dot3v = Vector256.Create((short)1);
183-
var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
188+
Vector256<short> dot3v = Vector256.Create((short)1);
189+
Vector256<sbyte> dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
184190

185191
// Process n blocks of data. At most NMAX data bytes can be
186192
// processed before s2 must be reduced modulo BASE.
187-
var vs1 = Vector256.CreateScalar(s1);
188-
var vs2 = Vector256.CreateScalar(s2);
193+
Vector256<uint> vs1 = Vector256.CreateScalar(s1);
194+
Vector256<uint> vs2 = Vector256.CreateScalar(s2);
189195

190196
while (length >= 32)
191197
{
@@ -243,6 +249,100 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
243249
}
244250
}
245251

252+
// Based on: https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
253+
/// <summary>
/// Updates an Adler-32 checksum over <paramref name="buffer"/> using ARM AdvSimd intrinsics.
/// Whole blocks of <c>BlockSize</c> bytes are processed with vector operations; any remaining
/// bytes are passed to <c>HandleLeftOver</c>.
/// </summary>
/// <param name="adler">The running Adler-32 value; s1 is taken from the low 16 bits and s2 from the high 16 bits.</param>
/// <param name="buffer">The bytes to fold into the checksum.</param>
/// <returns>The updated checksum, recombined as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
254+
private static unsafe uint CalculateArm(uint adler, ReadOnlySpan<byte> buffer)
255+
{
256+
// Split Adler-32 into component sums.
257+
uint s1 = adler & 0xFFFF;
258+
uint s2 = (adler >> 16) & 0xFFFF;
259+
uint length = (uint)buffer.Length;
260+
261+
// Process the data in blocks.
262+
// After the next two statements 'blocks' is the number of whole blocks and
// 'length' holds only the tail byte count that does not fill a block.
long blocks = length / BlockSize;
263+
length -= (uint)(blocks * BlockSize);
264+
fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
265+
{
266+
byte* localBufferPtr = bufferPtr;
267+
268+
while (blocks != 0)
269+
{
270+
// Cap the number of blocks handled per pass at NMAX / BlockSize so the
// sums are reduced modulo BASE before the next pass starts.
uint n = NMAX / BlockSize;
271+
if (n > blocks)
272+
{
273+
n = (uint)blocks;
274+
}
275+
276+
blocks -= n;
277+
278+
// Process n blocks of data. At most nMax data bytes can be
279+
// processed before s2 must be reduced modulo Base.
280+
Vector128<uint> vs1 = Vector128<uint>.Zero;
281+
// Seed lane 3 with s1 * n; the shift by 5 after the inner loop scales it by 32.
// NOTE(review): this presumably relies on BlockSize being 32 (two 16-byte loads per iteration) - confirm against the constant.
Vector128<uint> vs2 = vs1.WithElement(3, s1 * n);
282+
// Per-column byte totals, widened to 16 bits; they are weighted by 32..1 after the inner loop.
Vector128<ushort> vColumnSum1 = Vector128<ushort>.Zero;
283+
Vector128<ushort> vColumnSum2 = Vector128<ushort>.Zero;
284+
Vector128<ushort> vColumnSum3 = Vector128<ushort>.Zero;
285+
Vector128<ushort> vColumnSum4 = Vector128<ushort>.Zero;
286+
287+
// n is at least 1 here (blocks != 0), assuming NMAX / BlockSize is non-zero,
// so the do/while form never reads past the counted blocks.
do
288+
{
289+
// Load 32 input bytes.
290+
Vector128<ushort> bytes1 = AdvSimd.LoadVector128(localBufferPtr).AsUInt16();
291+
Vector128<ushort> bytes2 = AdvSimd.LoadVector128(localBufferPtr + 0x10).AsUInt16();
292+
293+
// Add previous block byte sum to v_s2.
294+
vs2 = AdvSimd.Add(vs2, vs1);
295+
296+
// Horizontally add the bytes for s1.
297+
vs1 = AdvSimd.AddPairwiseWideningAndAdd(
298+
vs1.AsUInt32(),
299+
AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1.AsByte()).AsUInt16(), bytes2.AsByte()));
300+
301+
// Vertically add the bytes for s2.
302+
vColumnSum1 = AdvSimd.AddWideningLower(vColumnSum1, bytes1.GetLower().AsByte());
303+
vColumnSum2 = AdvSimd.AddWideningLower(vColumnSum2, bytes1.GetUpper().AsByte());
304+
vColumnSum3 = AdvSimd.AddWideningLower(vColumnSum3, bytes2.GetLower().AsByte());
305+
vColumnSum4 = AdvSimd.AddWideningLower(vColumnSum4, bytes2.GetUpper().AsByte());
306+
307+
localBufferPtr += BlockSize;
308+
}
309+
while (--n > 0);
310+
311+
// Multiply the accumulated vs2 lanes by 32 (left shift by 5).
vs2 = AdvSimd.ShiftLeftLogical(vs2, 5);
312+
313+
// Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
314+
vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum1.GetLower(), Vector64.Create((ushort)32, 31, 30, 29));
315+
vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum1.GetUpper(), Vector64.Create((ushort)28, 27, 26, 25));
316+
vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum2.GetLower(), Vector64.Create((ushort)24, 23, 22, 21));
317+
vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum2.GetUpper(), Vector64.Create((ushort)20, 19, 18, 17));
318+
vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum3.GetLower(), Vector64.Create((ushort)16, 15, 14, 13));
319+
vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum3.GetUpper(), Vector64.Create((ushort)12, 11, 10, 9));
320+
vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum4.GetLower(), Vector64.Create((ushort)8, 7, 6, 5));
321+
vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum4.GetUpper(), Vector64.Create((ushort)4, 3, 2, 1));
322+
323+
// Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
324+
// After the two pairwise-add stages, lane 0 holds the total of vs1 and lane 1 the total of vs2.
Vector64<uint> sum1 = AdvSimd.AddPairwise(vs1.GetLower(), vs1.GetUpper());
325+
Vector64<uint> sum2 = AdvSimd.AddPairwise(vs2.GetLower(), vs2.GetUpper());
326+
Vector64<uint> s1s2 = AdvSimd.AddPairwise(sum1, sum2);
327+
328+
// Store the results.
329+
s1 += AdvSimd.Extract(s1s2, 0);
330+
s2 += AdvSimd.Extract(s1s2, 1);
331+
332+
// Reduce.
333+
s1 %= BASE;
334+
s2 %= BASE;
335+
}
336+
337+
// Fewer than BlockSize bytes remain; hand them to the shared tail routine.
if (length > 0)
338+
{
339+
HandleLeftOver(localBufferPtr, length, ref s1, ref s2);
340+
}
341+
342+
// Recombine the two sums into a single 32-bit value.
return s1 | (s2 << 16);
343+
}
344+
}
345+
246346
private static unsafe void HandleLeftOver(byte* localBufferPtr, uint length, ref uint s1, ref uint s2)
247347
{
248348
if (length >= 16)
@@ -286,7 +386,6 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
286386
{
287387
uint s1 = adler & 0xFFFF;
288388
uint s2 = (adler >> 16) & 0xFFFF;
289-
uint k;
290389

291390
fixed (byte* bufferPtr = buffer)
292391
{
@@ -295,7 +394,7 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
295394

296395
while (length > 0)
297396
{
298-
k = length < NMAX ? length : NMAX;
397+
uint k = length < NMAX ? length : NMAX;
299398
length -= k;
300399

301400
while (k >= 16)

src/ImageSharp/Compression/Zlib/Crc32.cs

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Runtime.InteropServices;
66
using System.Runtime.Intrinsics;
77
using System.Runtime.Intrinsics.X86;
8+
using ArmCrc32 = System.Runtime.Intrinsics.Arm.Crc32;
89

910
namespace SixLabors.ImageSharp.Compression.Zlib;
1011

@@ -60,6 +61,16 @@ public static uint Calculate(uint crc, ReadOnlySpan<byte> buffer)
6061
return ~CalculateSse(~crc, buffer);
6162
}
6263

64+
if (ArmCrc32.Arm64.IsSupported)
65+
{
66+
return ~CalculateArm64(~crc, buffer);
67+
}
68+
69+
if (ArmCrc32.IsSupported)
70+
{
71+
return ~CalculateArm(~crc, buffer);
72+
}
73+
6374
return ~CalculateScalar(~crc, buffer);
6475
}
6576

@@ -187,6 +198,100 @@ private static unsafe uint CalculateSse(uint crc, ReadOnlySpan<byte> buffer)
187198
}
188199
}
189200

201+
/// <summary>
/// Updates a CRC value over <paramref name="buffer"/> using the ARM CRC32 intrinsics,
/// consuming the data 32 bits at a time once the read pointer is 4-byte aligned.
/// </summary>
/// <param name="crc">The running CRC state. No bit inversion is applied here; <c>Calculate</c> inverts the value before and after calling this method.</param>
/// <param name="buffer">The bytes to fold into the CRC.</param>
/// <returns>The updated CRC state.</returns>
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
202+
private static unsafe uint CalculateArm(uint crc, ReadOnlySpan<byte> buffer)
203+
{
204+
fixed (byte* bufferPtr = buffer)
205+
{
206+
byte* localBufferPtr = bufferPtr;
207+
int len = buffer.Length;
208+
209+
// Consume single bytes until the pointer is aligned to a 4-byte boundary
// (or the data runs out).
while (len > 0 && ((ulong)localBufferPtr & 3) != 0)
210+
{
211+
crc = ArmCrc32.ComputeCrc32(crc, *localBufferPtr++);
212+
len--;
213+
}
214+
215+
uint* intBufferPtr = (uint*)localBufferPtr;
216+
217+
// Main loop, unrolled eight times: 32 bytes per iteration.
while (len >= 8 * sizeof(uint))
218+
{
219+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
220+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
221+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
222+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
223+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
224+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
225+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
226+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
227+
len -= 8 * sizeof(uint);
228+
}
229+
230+
// Remaining whole 32-bit words, one at a time.
while (len >= sizeof(uint))
231+
{
232+
crc = ArmCrc32.ComputeCrc32(crc, *intBufferPtr++);
233+
len -= sizeof(uint);
234+
}
235+
236+
localBufferPtr = (byte*)intBufferPtr;
237+
238+
// Trailing bytes that do not fill a 32-bit word.
while (len > 0)
239+
{
240+
crc = ArmCrc32.ComputeCrc32(crc, *localBufferPtr++);
241+
len--;
242+
}
243+
244+
return crc;
245+
}
246+
}
247+
248+
/// <summary>
/// Updates a CRC value over <paramref name="buffer"/> using the ARM64 CRC32 intrinsics,
/// consuming the data 64 bits at a time once the read pointer is 8-byte aligned.
/// </summary>
/// <param name="crc">The running CRC state. No bit inversion is applied here; <c>Calculate</c> inverts the value before and after calling this method.</param>
/// <param name="buffer">The bytes to fold into the CRC.</param>
/// <returns>The updated CRC state.</returns>
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
249+
private static unsafe uint CalculateArm64(uint crc, ReadOnlySpan<byte> buffer)
250+
{
251+
fixed (byte* bufferPtr = buffer)
252+
{
253+
byte* localBufferPtr = bufferPtr;
254+
int len = buffer.Length;
255+
256+
// Consume single bytes until the pointer is aligned to an 8-byte boundary
// (or the data runs out). The byte overload lives on the base ArmCrc32 class.
while (len > 0 && ((ulong)localBufferPtr & 7) != 0)
257+
{
258+
crc = ArmCrc32.ComputeCrc32(crc, *localBufferPtr++);
259+
len--;
260+
}
261+
262+
ulong* longBufferPtr = (ulong*)localBufferPtr;
263+
264+
// Main loop, unrolled eight times: 64 bytes per iteration.
while (len >= 8 * sizeof(ulong))
265+
{
266+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
267+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
268+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
269+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
270+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
271+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
272+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
273+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
274+
len -= 8 * sizeof(ulong);
275+
}
276+
277+
// Remaining whole 64-bit words, one at a time.
while (len >= sizeof(ulong))
278+
{
279+
crc = ArmCrc32.Arm64.ComputeCrc32(crc, *longBufferPtr++);
280+
len -= sizeof(ulong);
281+
}
282+
283+
localBufferPtr = (byte*)longBufferPtr;
284+
285+
// Trailing bytes that do not fill a 64-bit word.
while (len > 0)
286+
{
287+
crc = ArmCrc32.ComputeCrc32(crc, *localBufferPtr++);
288+
len--;
289+
}
290+
291+
return crc;
292+
}
293+
}
294+
190295
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
191296
private static uint CalculateScalar(uint crc, ReadOnlySpan<byte> buffer)
192297
{

0 commit comments

Comments
 (0)