Skip to content

Commit b8572b5

Browse files
Merge pull request #2344 from SixLabors/js/png-arm
Add ARM Intrinsics for PNG filters
2 parents 4b4028b + b8a7614 commit b8572b5

File tree

9 files changed

+476
-38
lines changed

9 files changed

+476
-38
lines changed

src/ImageSharp/Compression/Zlib/Adler32.cs

Lines changed: 101 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System.Runtime.CompilerServices;
55
using System.Runtime.InteropServices;
66
using System.Runtime.Intrinsics;
7+
using System.Runtime.Intrinsics.Arm;
78
using System.Runtime.Intrinsics.X86;
89

910
#pragma warning disable IDE0007 // Use implicit type
@@ -95,7 +96,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
9596
Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
9697
Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
9798
Vector128<byte> zero = Vector128<byte>.Zero;
98-
var ones = Vector128.Create((short)1);
99+
Vector128<short> ones = Vector128.Create((short)1);
99100

100101
while (blocks > 0)
101102
{
@@ -179,13 +180,13 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
179180
byte* localBufferPtr = bufferPtr;
180181

181182
Vector256<byte> zero = Vector256<byte>.Zero;
182-
var dot3v = Vector256.Create((short)1);
183-
var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
183+
Vector256<short> dot3v = Vector256.Create((short)1);
184+
Vector256<sbyte> dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
184185

185186
// Process n blocks of data. At most NMAX data bytes can be
186187
// processed before s2 must be reduced modulo BASE.
187-
var vs1 = Vector256.CreateScalar(s1);
188-
var vs2 = Vector256.CreateScalar(s2);
188+
Vector256<uint> vs1 = Vector256.CreateScalar(s1);
189+
Vector256<uint> vs2 = Vector256.CreateScalar(s2);
189190

190191
while (length >= 32)
191192
{
@@ -243,6 +244,100 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
243244
}
244245
}
245246

247+
/// <summary>
/// Updates an Adler-32 checksum with the contents of <paramref name="buffer"/> using ARM AdvSimd intrinsics.
/// </summary>
/// <remarks>
/// Based on: https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
/// </remarks>
/// <param name="adler">The running checksum; the low 16 bits hold s1 and the high 16 bits hold s2.</param>
/// <param name="buffer">The bytes to fold into the checksum.</param>
/// <returns>The updated checksum, packed as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
private static unsafe uint CalculateArm(uint adler, ReadOnlySpan<byte> buffer)
{
    // Unpack the two 16-bit component sums.
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Whole blocks go through the vector loop; whatever is left over is handled afterwards.
    uint tailLength = (uint)buffer.Length;
    long pendingBlocks = tailLength / BlockSize;
    tailLength -= (uint)(pendingBlocks * BlockSize);

    fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
    {
        byte* cursor = bufferPtr;

        while (pendingBlocks != 0)
        {
            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            uint n = pendingBlocks < NMAX / BlockSize ? (uint)pendingBlocks : NMAX / BlockSize;
            pendingBlocks -= n;

            Vector128<uint> rowSums = Vector128<uint>.Zero;

            // Lane 3 is seeded with s1 * n; the accumulator is shifted left by 5 after the loop.
            Vector128<uint> weightedSums = Vector128<uint>.Zero.WithElement(3, s1 * n);

            // Four 16-bit accumulators, each covering eight byte columns of the 32-byte block.
            Vector128<ushort> columns0 = Vector128<ushort>.Zero;
            Vector128<ushort> columns1 = Vector128<ushort>.Zero;
            Vector128<ushort> columns2 = Vector128<ushort>.Zero;
            Vector128<ushort> columns3 = Vector128<ushort>.Zero;

            do
            {
                // Load 32 input bytes.
                Vector128<byte> lowHalf = AdvSimd.LoadVector128(cursor);
                Vector128<byte> highHalf = AdvSimd.LoadVector128(cursor + 0x10);

                // Add the byte sum of the previous blocks to the s2 accumulator.
                weightedSums = AdvSimd.Add(weightedSums, rowSums);

                // Horizontally add the bytes for s1.
                Vector128<ushort> pairSums = AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(lowHalf), highHalf);
                rowSums = AdvSimd.AddPairwiseWideningAndAdd(rowSums, pairSums);

                // Vertically add the bytes for s2.
                columns0 = AdvSimd.AddWideningLower(columns0, lowHalf.GetLower());
                columns1 = AdvSimd.AddWideningLower(columns1, lowHalf.GetUpper());
                columns2 = AdvSimd.AddWideningLower(columns2, highHalf.GetLower());
                columns3 = AdvSimd.AddWideningLower(columns3, highHalf.GetUpper());

                cursor += BlockSize;
            }
            while (--n != 0);

            weightedSums = AdvSimd.ShiftLeftLogical(weightedSums, 5);

            // Multiply-add the column sums by [ 32, 31, 30, ... ] for s2.
            weightedSums = AdvSimd.MultiplyWideningLowerAndAdd(weightedSums, columns0.GetLower(), Vector64.Create((ushort)32, 31, 30, 29));
            weightedSums = AdvSimd.MultiplyWideningLowerAndAdd(weightedSums, columns0.GetUpper(), Vector64.Create((ushort)28, 27, 26, 25));
            weightedSums = AdvSimd.MultiplyWideningLowerAndAdd(weightedSums, columns1.GetLower(), Vector64.Create((ushort)24, 23, 22, 21));
            weightedSums = AdvSimd.MultiplyWideningLowerAndAdd(weightedSums, columns1.GetUpper(), Vector64.Create((ushort)20, 19, 18, 17));
            weightedSums = AdvSimd.MultiplyWideningLowerAndAdd(weightedSums, columns2.GetLower(), Vector64.Create((ushort)16, 15, 14, 13));
            weightedSums = AdvSimd.MultiplyWideningLowerAndAdd(weightedSums, columns2.GetUpper(), Vector64.Create((ushort)12, 11, 10, 9));
            weightedSums = AdvSimd.MultiplyWideningLowerAndAdd(weightedSums, columns3.GetLower(), Vector64.Create((ushort)8, 7, 6, 5));
            weightedSums = AdvSimd.MultiplyWideningLowerAndAdd(weightedSums, columns3.GetUpper(), Vector64.Create((ushort)4, 3, 2, 1));

            // Collapse each accumulator to a single lane: element 0 is the s1 total, element 1 the s2 total.
            Vector64<uint> foldedRow = AdvSimd.AddPairwise(rowSums.GetLower(), rowSums.GetUpper());
            Vector64<uint> foldedWeighted = AdvSimd.AddPairwise(weightedSums.GetLower(), weightedSums.GetUpper());
            Vector64<uint> totals = AdvSimd.AddPairwise(foldedRow, foldedWeighted);

            // Accumulate into the scalar sums.
            s1 += AdvSimd.Extract(totals, 0);
            s2 += AdvSimd.Extract(totals, 1);

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }

        if (tailLength > 0)
        {
            HandleLeftOver(cursor, tailLength, ref s1, ref s2);
        }

        return s1 | (s2 << 16);
    }
}
340+
246341
private static unsafe void HandleLeftOver(byte* localBufferPtr, uint length, ref uint s1, ref uint s2)
247342
{
248343
if (length >= 16)
@@ -286,7 +381,6 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
286381
{
287382
uint s1 = adler & 0xFFFF;
288383
uint s2 = (adler >> 16) & 0xFFFF;
289-
uint k;
290384

291385
fixed (byte* bufferPtr = buffer)
292386
{
@@ -295,7 +389,7 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
295389

296390
while (length > 0)
297391
{
298-
k = length < NMAX ? length : NMAX;
392+
uint k = length < NMAX ? length : NMAX;
299393
length -= k;
300394

301395
while (k >= 16)

src/ImageSharp/Compression/Zlib/Crc32.cs

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Runtime.InteropServices;
66
using System.Runtime.Intrinsics;
77
using System.Runtime.Intrinsics.X86;
8+
using ArmCrc32 = System.Runtime.Intrinsics.Arm.Crc32;
89

910
namespace SixLabors.ImageSharp.Compression.Zlib;
1011

@@ -187,6 +188,100 @@ private static unsafe uint CalculateSse(uint crc, ReadOnlySpan<byte> buffer)
187188
}
188189
}
189190

191+
/// <summary>
/// Updates a CRC-32 value with the contents of <paramref name="buffer"/> using the ARM CRC32 intrinsics,
/// consuming the data 32 bits at a time where possible.
/// </summary>
/// <param name="crc">The running CRC value.</param>
/// <param name="buffer">The bytes to fold into the CRC.</param>
/// <returns>The updated CRC value.</returns>
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
private static unsafe uint CalculateArm(uint crc, ReadOnlySpan<byte> buffer)
{
    fixed (byte* bufferPtr = buffer)
    {
        byte* bytePtr = bufferPtr;
        int remaining = buffer.Length;

        // Consume single bytes until the pointer is 4-byte aligned (or the data runs out).
        for (; remaining > 0 && ((ulong)bytePtr & 3) != 0; remaining--)
        {
            crc = ArmCrc32.ComputeCrc32(crc, *bytePtr++);
        }

        uint* wordPtr = (uint*)bytePtr;

        // Main loop: eight 32-bit words per iteration.
        const int unrolledBytes = 8 * sizeof(uint);
        while (remaining >= unrolledBytes)
        {
            crc = ArmCrc32.ComputeCrc32(crc, wordPtr[0]);
            crc = ArmCrc32.ComputeCrc32(crc, wordPtr[1]);
            crc = ArmCrc32.ComputeCrc32(crc, wordPtr[2]);
            crc = ArmCrc32.ComputeCrc32(crc, wordPtr[3]);
            crc = ArmCrc32.ComputeCrc32(crc, wordPtr[4]);
            crc = ArmCrc32.ComputeCrc32(crc, wordPtr[5]);
            crc = ArmCrc32.ComputeCrc32(crc, wordPtr[6]);
            crc = ArmCrc32.ComputeCrc32(crc, wordPtr[7]);
            wordPtr += 8;
            remaining -= unrolledBytes;
        }

        // Remaining whole words, one at a time.
        while (remaining >= sizeof(uint))
        {
            crc = ArmCrc32.ComputeCrc32(crc, *wordPtr);
            wordPtr++;
            remaining -= sizeof(uint);
        }

        // Trailing bytes that do not fill a whole word.
        bytePtr = (byte*)wordPtr;
        for (; remaining > 0; remaining--)
        {
            crc = ArmCrc32.ComputeCrc32(crc, *bytePtr++);
        }

        return crc;
    }
}
237+
238+
/// <summary>
/// Updates a CRC-32 value with the contents of <paramref name="buffer"/> using the ARM64 CRC32 intrinsics,
/// consuming the data 64 bits at a time where possible.
/// </summary>
/// <param name="crc">The running CRC value.</param>
/// <param name="buffer">The bytes to fold into the CRC.</param>
/// <returns>The updated CRC value.</returns>
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
private static unsafe uint CalculateArm64(uint crc, ReadOnlySpan<byte> buffer)
{
    fixed (byte* bufferPtr = buffer)
    {
        byte* bytePtr = bufferPtr;
        int remaining = buffer.Length;

        // Consume single bytes until the pointer is 8-byte aligned (or the data runs out).
        for (; remaining > 0 && ((ulong)bytePtr & 7) != 0; remaining--)
        {
            crc = ArmCrc32.ComputeCrc32(crc, *bytePtr++);
        }

        ulong* quadPtr = (ulong*)bytePtr;

        // Main loop: eight 64-bit words per iteration.
        const int unrolledBytes = 8 * sizeof(ulong);
        while (remaining >= unrolledBytes)
        {
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, quadPtr[0]);
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, quadPtr[1]);
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, quadPtr[2]);
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, quadPtr[3]);
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, quadPtr[4]);
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, quadPtr[5]);
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, quadPtr[6]);
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, quadPtr[7]);
            quadPtr += 8;
            remaining -= unrolledBytes;
        }

        // Remaining whole 64-bit words, one at a time.
        while (remaining >= sizeof(ulong))
        {
            crc = ArmCrc32.Arm64.ComputeCrc32(crc, *quadPtr);
            quadPtr++;
            remaining -= sizeof(ulong);
        }

        // Trailing bytes that do not fill a whole 64-bit word.
        bytePtr = (byte*)quadPtr;
        for (; remaining > 0; remaining--)
        {
            crc = ArmCrc32.ComputeCrc32(crc, *bytePtr++);
        }

        return crc;
    }
}
284+
190285
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
191286
private static uint CalculateScalar(uint crc, ReadOnlySpan<byte> buffer)
192287
{

src/ImageSharp/Formats/Png/Filters/AverageFilter.cs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System.Runtime.CompilerServices;
55
using System.Runtime.InteropServices;
66
using System.Runtime.Intrinsics;
7+
using System.Runtime.Intrinsics.Arm;
78
using System.Runtime.Intrinsics.X86;
89

910
namespace SixLabors.ImageSharp.Formats.Png.Filters;
@@ -35,6 +36,10 @@ public static void Decode(Span<byte> scanline, Span<byte> previousScanline, int
3536
{
3637
DecodeSse2(scanline, previousScanline);
3738
}
39+
else if (AdvSimd.IsSupported && bytesPerPixel is 4)
40+
{
41+
DecodeArm(scanline, previousScanline);
42+
}
3843
else
3944
{
4045
DecodeScalar(scanline, previousScanline, bytesPerPixel);
@@ -48,7 +53,7 @@ private static void DecodeSse2(Span<byte> scanline, Span<byte> previousScanline)
4853
ref byte prevBaseRef = ref MemoryMarshal.GetReference(previousScanline);
4954

5055
Vector128<byte> d = Vector128<byte>.Zero;
51-
var ones = Vector128.Create((byte)1);
56+
Vector128<byte> ones = Vector128.Create((byte)1);
5257

5358
int rb = scanline.Length;
5459
nint offset = 1;
@@ -75,6 +80,33 @@ private static void DecodeSse2(Span<byte> scanline, Span<byte> previousScanline)
7580
}
7681
}
7782

83+
/// <summary>
/// Decodes, in place, a scanline encoded with the 'Average' filter using ARM AdvSimd instructions.
/// Each iteration handles 4 bytes: the halved sum of the previously decoded 4 bytes and the 4 bytes
/// at the same offset in <paramref name="previousScanline"/> is added to the current 4 bytes.
/// </summary>
/// <param name="scanline">
/// The scanline to decode. Index 0 is skipped (presumably the filter-type byte; confirm against the caller).
/// </param>
/// <param name="previousScanline">
/// The previous scanline, read at the same offsets as <paramref name="scanline"/>.
/// It is not length-checked here, so it must be at least as long as <paramref name="scanline"/>.
/// </param>
public static void DecodeArm(Span<byte> scanline, Span<byte> previousScanline)
{
    ref byte scanBaseRef = ref MemoryMarshal.GetReference(scanline);
    ref byte prevBaseRef = ref MemoryMarshal.GetReference(previousScanline);

    // Holds the most recently decoded 4 bytes; zero before the first batch.
    Vector64<byte> d = Vector64<byte>.Zero;

    const int bytesPerBatch = 4;
    int offset = 1;

    // Only the bytes from 'offset' onward are processed, so the skipped leading byte must not be
    // counted. Counting it let the last iteration read and write 4 bytes with only 3 left in the
    // span whenever Length was a non-zero multiple of 4. The iteration count is unchanged for
    // every other length.
    int rb = scanline.Length - offset;
    while (rb >= bytesPerBatch)
    {
        ref byte scanRef = ref Unsafe.Add(ref scanBaseRef, offset);
        Vector64<byte> a = d;
        Vector64<byte> b = Vector64.CreateScalar(Unsafe.As<byte, int>(ref Unsafe.Add(ref prevBaseRef, offset))).AsByte();
        d = Vector64.CreateScalar(Unsafe.As<byte, int>(ref scanRef)).AsByte();

        // Per-byte (a + b) / 2, then add it to the current bytes.
        Vector64<byte> avg = AdvSimd.FusedAddHalving(a, b);
        d = AdvSimd.Add(d, avg);

        Unsafe.As<byte, int>(ref scanRef) = d.AsInt32().ToScalar();

        rb -= bytesPerBatch;
        offset += bytesPerBatch;
    }
}
109+
78110
[MethodImpl(MethodImplOptions.AggressiveInlining)]
79111
private static void DecodeScalar(Span<byte> scanline, Span<byte> previousScanline, int bytesPerPixel)
80112
{

0 commit comments

Comments
 (0)