Skip to content

Commit 4f75a4c

Browse files
committed
improving the performance in the case where ignorable characters are uncommon
1 parent bd23f79 commit 4f75a4c

File tree

4 files changed

+72
-7
lines changed

4 files changed

+72
-7
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ fully reproducible.
3232

3333
| processor and base freq. | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
3434
|:----------------|:------------------------|:-------------------|:-------------------|
35-
| Apple M2 processor (ARM, 3.5 Ghz) | 6.5 | 3.8 | 1.7 x |
35+
| Apple M2 processor (ARM, 3.5 GHz) | 10 | 3.8 | 2.6 x |
3636
| AWS Graviton 3 (ARM, 2.6 GHz) | 3.6 | 2.0 | 1.8 x |
3737
| Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
3838
| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |

benchmark/Benchmark.cs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,9 @@
22
using BenchmarkDotNet.Running;
33
using BenchmarkDotNet.Configs;
44
using BenchmarkDotNet.Reports;
5-
using BenchmarkDotNet.Filters;
65
using BenchmarkDotNet.Jobs;
76
using System.Text;
8-
using System.Runtime.InteropServices;
97
using BenchmarkDotNet.Columns;
10-
using System.Runtime.Intrinsics;
11-
using System.Runtime.Intrinsics.X86;
128

139
namespace SimdUnicodeBenchmarks
1410
{
@@ -464,7 +460,7 @@ public unsafe void RunOurDecodingBenchmarkWithAllocUTF16(string[] data, int[] le
464460

465461
if (dataoutput.Length != lengths[i])
466462
{
467-
Console.WriteLine($"Error: {dataoutput.Length } != {lengths[i]}");
463+
Console.WriteLine($"Error: {dataoutput.Length} != {lengths[i]}");
468464
#pragma warning disable CA2201
469465
throw new Exception("Error");
470466
}

src/Base64.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ public static int MaximalBinaryLengthFromBase64<T>(ReadOnlySpan<T> input)
1414
{
1515
return Scalar.Base64.MaximalBinaryLengthFromBase64Scalar(input);
1616
}
17-
public static byte[] FromBase64String(string s) {
17+
public static byte[] FromBase64String(string s)
18+
{
1819
ReadOnlySpan<char> base64 = s.AsSpan();
1920
byte[] newBytes = new byte[SimdBase64.Base64.MaximalBinaryLengthFromBase64<char>(base64)];
2021
int bytesConsumed = 0;

src/Base64ARM.cs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,74 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
219219
[MethodImpl(MethodImplOptions.AggressiveInlining)]
220220
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
221221
{
222+
223+
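// if mask has exactly one bit set (a power of 2), we can use a simpler version; NOTE(review): the test below also accepts mask == 0 — confirm callers never pass 0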
224+
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
225+
{
226+
int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
227+
int pos = pos64 & 0xf;
228+
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
229+
230+
231+
Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
232+
switch (pos64 >> 4)
233+
{
234+
case 3:
235+
{
236+
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
237+
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
238+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk0, sh);
239+
Vector128.Store(compressed, output + 0 * 16);
240+
Vector128.Store(b.chunk1, output + 1 * 16 - 1);
241+
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
242+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
243+
244+
}
245+
break;
246+
247+
case 2:
248+
{
249+
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
250+
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
251+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk1, sh);
252+
Vector128.Store(b.chunk0, output + 0 * 16);
253+
Vector128.Store(compressed, output + 1 * 16);
254+
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
255+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
256+
257+
}
258+
break;
259+
260+
case 1:
261+
{
262+
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
263+
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
264+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
265+
Vector128.Store(b.chunk0, output + 0 * 16);
266+
Vector128.Store(b.chunk1, output + 1 * 16);
267+
Vector128.Store(compressed, output + 2 * 16);
268+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
269+
270+
}
271+
break;
272+
273+
case 0:
274+
{
275+
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
276+
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
277+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
278+
Vector128.Store(b.chunk0, output + 0 * 16);
279+
Vector128.Store(b.chunk1, output + 1 * 16);
280+
Vector128.Store(b.chunk2, output + 2 * 16);
281+
Vector128.Store(compressed, output + 3 * 16);
282+
}
283+
break;
284+
}
285+
286+
287+
return 63;
288+
289+
}
222290
ulong nmask = ~mask;
223291
Compress(b.chunk0, (ushort)mask, output, tablePtr);
224292
Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);

0 commit comments

Comments
 (0)